Masatoshi Ito の備忘録

旧Blogのアーカイブです。

goo blogの閉鎖に伴い、アーカイブとして移行することにしました。

古い情報やリンク切れなどがあると思いますが、ご了承ください。

RSS取得プログラム（RSSリーダーではないです…多分…）

（2009年09月10日 22時31分50秒｜Zaurus/Python）

PythonでRSSを取得するプログラムを書いてみました。
TTextReaderで読めるテキスト形式で保存するだけですので、TTextReaderと組み合わせれば、RSSリーダーと呼べなくはないと思いますが…使い方次第でしょうか…(^_^;

実行するにはPythonとFeed Parserが必要です。

Festina lente: ZaurusにPythonインストール
http://noel-festinalente.cocolog-nifty.com/blog/2006/06/zauruspython_a346_1.html
Feed Parser
~~http://www.feedparser.org/~~

以下の場所に設定ファイルを置いて下さい。

/home/zaurus/Documents/Text_Files/rss/rssurl.txt

設定ファイルの内容は以下の通りです。

[サイトの名前][TAB][RSSのURL]
[サイトの名前][TAB][RSSのURL]

[サイトの名前]は省略可能。
[TAB]はタブ文字です。
１行につき１サイトの設定を書きます。

rssurl.txt（設定ファイル）の例。

	http://blog.goo.ne.jp/ito65/index.rdf
レクリエーションWiki	http://ito65-web.hp.infoseek.co.jp/cgi-bin/recwiki/wiki.cgi?RecentChanges

ネットが繋がる状態で起動すると以下の場所にTTextReaderで読めるテキストファイルを書き出します。
パス等を変更する場合は適宜プログラム（一番下ぐらい）を変更して下さい。

/home/zaurus/Documents/Text_Files/rss/rss-日付-時間.txt

ここからプログラムです。

#!/usr/bin/env python
# vim: fileencoding=utf-8

import codecs
import feedparser
from time import localtime, strftime
from calendar import timegm
from HTMLParser import HTMLParser, HTMLParseError

class ToTextHTMLParser(HTMLParser):
	"""HTML parser that accumulates a plain-text rendering in ``self.text``.

	Text nodes are appended with surrounding tabs, CRs and LFs removed,
	``<br>`` and ``<p>`` start a new line, and ``<img>`` is rendered as
	``[IMG]`` or ``[IMG:alt text]``. All other tags are ignored.

	NOTE(review): this class defines no handle_entityref/handle_charref,
	so how ``&amp;``-style references are rendered depends entirely on
	the HTMLParser base class in use -- confirm on the target Python.
	"""

	# Characters trimmed from both ends of every text node.
	_TRIM_CHARS = "\t\r\n"
	# Tags that begin a new output line.
	_BREAK_TAGS = ("br", "p")

	def __init__(self):
		HTMLParser.__init__(self)
		# Plain-text output collected so far.
		self.text = ""

	def handle_data(self, data):
		"""Append a text node, minus leading/trailing tab, CR and LF."""
		self.text += data.strip(self._TRIM_CHARS)

	def handle_starttag(self, tag, attrs):
		"""Translate the few tags that have a plain-text equivalent."""
		if tag in self._BREAK_TAGS:
			self.text += "\n"
		elif tag == "img":
			# Keep a visible placeholder so the reader knows an image
			# was here; include the alt text only when it is non-empty.
			alt = dict(attrs).get("alt")
			if alt:
				self.text += "[IMG:" + alt + "]"
			else:
				self.text += "[IMG]"

	def handle_endtag(self, tag):
		"""Closing tags produce no output."""
		pass

def html2text(html):
	"""Convert an HTML fragment to plain text with ToTextHTMLParser.

	Returns the parser's accumulated text. If the parser raises
	HTMLParseError, the input is returned unchanged instead.
	"""
	parser = ToTextHTMLParser()
	try:
		parser.feed(html)
		parser.close()
	except HTMLParseError:
		# Unparseable markup: fall back to the raw input rather than
		# losing the content.
		return html
	return parser.text

def getText(detail):
	"""Return the plain-text form of a content "detail" object.

	``detail`` must expose ``type`` (a string) and ``value``. When
	``type`` contains "html" the value is run through html2text();
	otherwise the value is returned untouched.
	(Presumably a feedparser *_detail dict -- verify against callers.)
	"""
	is_html = detail.type.find("html") != -1
	if not is_html:
		return detail.value
	return html2text(detail.value)

def convLine(line):
	"""Tidy a block of entry text before it is written to the output.

	- Strips full-width spaces, ASCII spaces, CR and LF from both ends.
	- Prefixes a space to every line that begins with ".".
	- Collapses runs of three or more newlines down to two.

	This script emits "." / ".." at the start of a line for feed and
	entry headings (see getFeedTextData), so body lines starting with a
	dot are escaped -- presumably so the reader does not treat them as
	headings.
	"""
	# \u3000 is the full-width (ideographic) space; spelled as an escape
	# so the literal does not depend on the source file's encoding.
	line = line.strip(u"\u3000 \r\n")

	# Escape a dot at the very start, then at the start of later lines.
	if line.startswith("."):
		line = " " + line
	line = line.replace("\n.", "\n .")

	# Each pass shortens the string, so this always terminates.
	while "\n\n\n" in line:
		line = line.replace("\n\n\n", "\n\n")
	return line

def getFeedTextData(title, url):
	"""Fetch one feed and render it as text for the output file.

	Parameters:
		title: heading to use for the feed; when empty, the feed's own
			title is used instead.
		url: feed URL; anything not starting with "http" is rejected.

	Returns the rendered text, or None when the URL is rejected or the
	parsed result has no feed title (treated as "could not fetch").

	Output layout: a ".title" heading line per feed and a
	"..timestamp title" heading line per entry, each followed by the
	same text without the dots, the link, and the converted body.
	Progress messages are printed to stdout.
	"""
	if not url.startswith("http"):
		return None

	# Single-argument print(...) behaves the same as the print statement.
	print(url + "\tparse")
	d = feedparser.parse(url)
	if "title" not in d.feed:
		return None

	print("\ttext")
	if not title:
		title = getText(d.feed.title_detail)
	parts = ["." + title + "\n", title + "\n"]
	if "subtitle_detail" in d.feed:
		parts.append(getText(d.feed.subtitle_detail) + "\n")
	parts.append(d.feed.link + "\n")

	for e in d.entries:
		entry_title = getText(e.title_detail)

		# Use .get(): attribute access raises on entries that carry no
		# date, which made the placeholder fallback unreachable.
		updated_parsed = e.get("updated_parsed")
		if updated_parsed:
			# timegm() reads the struct_time as UTC; localtime() then
			# converts it to the local time zone for display.
			updated = localtime(timegm(updated_parsed))
			timestr = strftime("%Y/%m/%d %H:%M:%S", updated)
		elif e.get("updated"):
			# Date string present but not parseable: show it verbatim.
			timestr = e.updated
		else:
			timestr = "----/--/-- --:--:--"

		heading = timestr + " " + entry_title + "\n"
		parts.append(".." + heading)
		parts.append(heading)
		parts.append(e.link + "\n")
		parts.append("\n")

		# Prefer full content, then the typed summary, then raw summary.
		if "content" in e:
			for c in e.content:
				parts.append(convLine(getText(c)) + "\n")
		elif "summary_detail" in e:
			parts.append(convLine(getText(e.summary_detail)) + "\n")
		elif "summary" in e:
			parts.append(convLine(e.summary) + "\n")

	print("\tend")
	return "".join(parts)

# Change these two paths to relocate the settings file / output file.
RSS_URL_FILE = "/home/zaurus/Documents/Text_Files/rss/rssurl.txt"
# strftime() pattern; the date and time of the run become the file name.
RSS_OUT_PATTERN = "/home/zaurus/Documents/Text_Files/rss/rss-%Y%m%d-%H%M%S.txt"

def main():
	"""Read the feed list and write all feeds into one timestamped file.

	Each settings line is "<site name><TAB><feed URL>"; the site name may
	be empty. Lines starting with "#" and lines that do not have exactly
	two tab-separated fields are skipped.
	"""
	# Read the settings first so a missing settings file does not leave
	# an empty output file behind. try/finally (rather than "with") keeps
	# this working on old interpreters.
	infile = codecs.open(RSS_URL_FILE, "r", "utf8")
	try:
		lines = infile.readlines()
	finally:
		infile.close()

	outfile = codecs.open(strftime(RSS_OUT_PATTERN, localtime()), "w", "utf8")
	try:
		for line in lines:
			if line.startswith("#"):
				continue
			# Drop only the line ending: a leading TAB is significant
			# (it means "no site name"), so a plain strip() would be wrong.
			fields = line.rstrip("\r\n").split("\t")
			if len(fields) != 2:
				continue
			title, url = fields
			text = getFeedTextData(title, url)
			if text:
				outfile.write(text)
	finally:
		outfile.close()

if __name__ == "__main__":
	main()

ここまでです。