python-twitter で本文と付加情報の分離

(追記) どの投稿に対する返信なのかについて、python-twitter のクラス Status に in_reply_to_status_id ってのがありました。

status.in_reply_to_screen_name (None | string)
status.in_reply_to_user_id (None | long)
status.in_reply_to_status_id (None | long)

返信機能を使った場合に返信先のつぶやきID, ユーザID, ユーザ名が取得できます。

とりあえずこんな感じでしょうか。Replies, keywords, retweets, URLs のデータ構造はまだ考えていない。

#!/usr/bin/env python 
# coding: UTF-8

from datetime import datetime
import re
import twitter


# ########
def main():
	"""Fetch the friends timeline and print every post with its metadata.

	For each post the creation time is reformatted, the text is run
	through getPureText(), and the raw text, the remaining body and any
	replies / keywords / retweets / URLs that were found are printed.
	"""
	api = twitter.Api("username", "password")

	timeline = api.GetFriendsTimeline()

	for post in timeline:
		# Turn the timestamp into a datetime object.
		# %z (for the "+0000" part) could not be used, so the offset is
		# matched literally; in practice the time comes back in GMT, so
		# this is not a problem.
		date = datetime.strptime(post.created_at, "%a %b %d %H:%M:%S +0000 %Y")
		datestr = date.strftime("%Y-%m-%d %H:%M:%S")

		# Pull the real body text out of post.text.
		(pureText, replies, keywords, retweets, urls) = getPureText(post.text)

		# Test output, laid out for reading.
		# print(...) with one parenthesised argument prints the same thing
		# as a Python 2 print statement and is also valid on Python 3.
		print(post.user.screen_name + " (" + datestr + "):")
		print("==" + post.text)
		print("++" + pureText)
		if replies:
			print("  reply to: " + replies)
		if keywords:
			print("  keywords: " + keywords)
		if retweets:
			print("  retweets: " + retweets)
		if urls:
			print("  urls: " + " ".join(urls))
		print("")


# ########
def getPureText(text):
	"""Separate the body of a tweet from the metadata embedded in it.

	The following pieces are cut out of ``text``, in this order: URLs,
	a run of @-mentions at the very start (optionally preceded by "."),
	hashtags, a retweet of the form "RT @user: ..." and a retweet of the
	form "RT: ... (via @user)".

	Returns the tuple ``(text, replies, keywords, retweets, urls)``:
	``text`` is what remains after the removals, ``urls`` is the list of
	removed URLs, and ``replies`` / ``keywords`` / ``retweets`` are the
	removed strings ("" when nothing matched).  Every hashtag in
	``keywords`` is followed by one space.
	"""
	# Extracted metadata
	replies = ""
	retweets = ""
	hashtags = []
	urls = []

	# Strip URLs.  "_" is part of the character class because it is an
	# unreserved URL character; without it http://example.com/foo_bar was
	# cut at the underscore and "_bar" stayed behind in the body.
	p = re.compile(r"(https?://[-0-9A-Za-z_/,.;:~?&%\@=+#]+)")
	m = p.search(text)
	while m:
		(text, extra) = splitMatchedString(text, m)
		urls.append(extra)
		m = p.search(text)

	# Reply: only mentions at the start of the text count (match, not search).
	p = re.compile(r"(?:|\.\s*)(@\w+(?:\s+|$))+")
	m = p.match(text)
	if m:
		(text, extra) = splitMatchedString(text, m)
		replies = extra

	# Keywords (hashtags), collected one at a time
	p = re.compile(r"#\w+\b")
	m = p.search(text)
	while m:
		(text, extra) = splitMatchedString(text, m)
		hashtags.append(extra)
		m = p.search(text)
	# Same format as before: each tag followed by a single space.
	keywords = "".join(tag + " " for tag in hashtags)

	# ReTweet (1): "RT @user: ..." -- the match runs to the end of the line
	p = re.compile(r"\bRT @(\w+):\s*(.*)")
	m = p.search(text)
	if m:
		(text, extra) = splitMatchedString(text, m)
		retweets = extra

	# ReTweet (2): "RT: ... (via @user)", appended to anything found above
	p = re.compile(r"\bRT:\s*(.*)\(via @(\w+)\)")
	m = p.search(text)
	if m:
		(text, extra) = splitMatchedString(text, m)
		retweets += extra

	return (text, replies, keywords, retweets, urls)


# ########
def splitMatchedString(text, m):
	"""Cut the span matched by ``m`` out of ``text``.

	Returns ``(remainder, removed)``: ``remainder`` is ``text`` with the
	characters from ``m.start()`` up to ``m.end()`` taken out, and
	``removed`` is that slice of ``text``.
	"""
	begin, finish = m.span()
	removed = text[begin:finish]
	remainder = text[:begin] + text[finish:]
	return (remainder, removed)


# ########
# Entry point: main() is called unconditionally when this file is executed.
main()

リストを文字列として結合するのは "separator".join(list) なのかー慣れないな。
参考にした/これからする：

正規表現のグループに名前付けられるんだっけ？