# coding: utf-8 # wrapShortenedURLs.py: use a regular expression to find URLs from # URL-shortening services, wrap them in elements # that point to the real destination. Good for archived Twitter messages. # See http://www.devx.com/webdev/Article/40359/1954 about simple # ways of using Twitter's API to retrieve Twitter messages. # copyright 2009 Bob DuCharme no warranty expressed or implied. # 2009-04-16 Now wrapping a/@href in ' instead of " for easier use # with CSV files import twitter import re import sys import httplib import string import urlparse shorteningSvcs = "is.gd|tinyurl.com|bit.ly|snipurl.com|cli.gs" URLPattern = re.compile("(?Phttp://(" + shorteningSvcs + \ ")[a-zA-Z0-9\/\?\&\-\_\=\#\~]+)",re.I) if len(sys.argv) != 2: print "\nEnter\n\n wrapShortenedURLs.py infile.txt\n" print "where infile.txt is the file where you want shortened URLs wrapped" print "with elements pointing to the actual destination. Typically," print "this will be a file of Tweets retrieved with the Twitter API." sys.exit() else: infile = sys.argv[1] fh = open(infile, "r") body = fh.readlines() for line in body: # Each line may have more than one shortened URLs to replace, # so put them all in a list and go through the list. foundURLList = URLPattern.findall(line) if len(foundURLList) > 0: for URLEntry in foundURLList: URL = URLEntry[0] o = urlparse.urlparse(URL) conn = httplib.HTTPConnection(o.netloc) conn.request("GET", o.path) r = conn.getresponse() realURL = r.getheader('Location') if realURL == None: realURL = "could not resolve URL" else: realURL = string.replace(realURL,'&','&') line = string.replace(line,URL,"%s" % \ (realURL,URL)) print line, fh.close()