# coding: utf-8
# wrapShortenedURLs.py: use a regular expression to find URLs from
# URL-shortening services and wrap each one in an <a> element whose href
# points to the real destination. Good for archived Twitter messages.
# See http://www.devx.com/webdev/Article/40359/1954 about simple
# ways of using Twitter's API to retrieve Twitter messages.
# copyright 2009 Bob DuCharme no warranty expressed or implied.
# 2009-04-16 Now wrapping a/@href in ' instead of " for easier use
# with CSV files
import twitter
import re
import sys
import httplib
import string
import urlparse
shorteningSvcs = "is.gd|tinyurl.com|bit.ly|snipurl.com|cli.gs"

# Each host is escaped so "." only matches a literal dot, and the outer
# group is given a name: a bare "(?P" with no "<name>" is rejected by
# re.compile. Two groups are kept so findall() still yields
# (url, service) tuples, with the full URL at index 0.
URLPattern = re.compile(
    r"(?P<url>http://("
    + "|".join(re.escape(svc) for svc in shorteningSvcs.split("|"))
    + r")[a-zA-Z0-9/?&\-_=#~]+)",
    re.I)


def escapeAttr(value):
    """Return value escaped for use inside a single-quoted XML attribute."""
    # "&" must be handled first so the entities added by the later
    # replacements are not escaped a second time.
    return (value.replace("&", "&amp;")
                 .replace("<", "&lt;")
                 .replace(">", "&gt;")
                 .replace("'", "&#39;"))


def resolveURL(URL, timeout=10):
    """Return the redirect target a shortening service reports for URL.

    Sends one GET request for URL and returns the value of the response's
    Location header. Returns None when the header is absent or when the
    request fails (connection error, timeout, malformed response), so one
    unreachable service does not abort the whole run.

    URL     -- the shortened URL, as matched by URLPattern
    timeout -- seconds to wait on the socket before giving up
    """
    o = urlparse.urlparse(URL)
    # Keep the query string, and never send an empty request path.
    target = o.path or "/"
    if o.query:
        target += "?" + o.query
    conn = httplib.HTTPConnection(o.netloc, timeout=timeout)
    try:
        conn.request("GET", target)
        return conn.getresponse().getheader('Location')
    except (httplib.HTTPException, IOError):
        # Socket-level errors are IOError subclasses.
        return None
    finally:
        conn.close()


def wrapLine(line, resolve=None):
    """Return line with every shortened URL wrapped in an a element.

    Each match of URLPattern becomes <a href='REAL'>SHORT</a>, where REAL
    is the attribute-escaped result of resolve(SHORT), or the text
    "could not resolve URL" when resolve returns None.

    line    -- the text to process
    resolve -- callable mapping a shortened URL to its destination or
               None; defaults to resolveURL
    """
    if resolve is None:
        resolve = resolveURL

    def wrap(match):
        URL = match.group("url")
        realURL = resolve(URL)
        if realURL is None:
            realURL = "could not resolve URL"
        else:
            realURL = escapeAttr(realURL)
        return "<a href='%s'>%s</a>" % (realURL, URL)

    # Substituting match by match rewrites each occurrence exactly once.
    # A plain string replace per found URL would re-wrap repeated URLs and
    # corrupt a URL that is a prefix of another one on the same line.
    return URLPattern.sub(wrap, line)


def main(argv):
    """Wrap the shortened URLs in the file named by argv[1], to stdout.

    Prints a usage message and exits when argv does not hold exactly one
    argument after the script name.
    """
    if len(argv) != 2:
        print("\nEnter\n\n wrapShortenedURLs.py infile.txt\n")
        print("where infile.txt is the file where you want shortened URLs wrapped")
        print("with elements pointing to the actual destination. Typically,")
        print("this will be a file of Tweets retrieved with the Twitter API.")
        sys.exit()

    # Remember every lookup so a URL repeated in the file costs only one
    # network request.
    resolved = {}

    def cachedResolve(URL):
        if URL not in resolved:
            resolved[URL] = resolveURL(URL)
        return resolved[URL]

    with open(argv[1], "r") as fh:
        for line in fh:
            # Lines keep their own line endings, so write them unchanged.
            sys.stdout.write(wrapLine(line, cachedResolve))


if __name__ == "__main__":
    main(sys.argv)