# coding: utf-8

# wrapShortenedURLs.py: use a regular expression to find URLs from
# URL-shortening services, wrap them in <a href=""></a> elements
# that point to the real destination. Good for archived Twitter messages.
# See http://www.devx.com/webdev/Article/40359/1954 about simple
# ways of using Twitter's API to retrieve Twitter messages.

# copyright 2009 Bob DuCharme no warranty expressed or implied.

# 2009-04-16 Now wrapping a/@href in ' instead of " for easier use
# with CSV files

import httplib
import re
import socket
import string
import sys
import urlparse

import twitter

# Host names of the URL-shortening services to recognize, separated by "|".
shorteningSvcs = "is.gd|tinyurl.com|bit.ly|snipurl.com|cli.gs"

# Matches one shortened URL, case-insensitively. Group "URL" (index 1) is the
# whole URL and group 2 is the service host, so findall() yields
# (URL, service) tuples. Each host goes through re.escape so that its dots
# match only a literal "."; unescaped, "bit.ly" would also match "bitxly".
URLPattern = re.compile(
    r"(?P<URL>http://(" +
    "|".join([re.escape(svc) for svc in shorteningSvcs.split("|")]) +
    r")[a-zA-Z0-9/?&\-_=#~]+)", re.I)

# Exactly one command-line argument (the input file name) is required.
# Anything else is a usage error: the help text goes to stderr so it cannot
# be mistaken for converted output on stdout, and the exit status is nonzero
# so that calling shells and scripts can detect the failure.
if len(sys.argv) != 2:
    sys.stderr.write(
        "\nEnter\n\n   wrapShortenedURLs.py infile.txt\n\n"
        "where infile.txt is the file where you want shortened URLs wrapped\n"
        "with <a></a> elements pointing to the actual destination. Typically,\n"
        "this will be a file of Tweets retrieved with the Twitter API.\n")
    sys.exit(1)

# Name of the text file whose shortened URLs will be wrapped.
infile = sys.argv[1]

def _escapeForAttribute(text):
    """Return text with HTML-special characters replaced by entities.

    The result is safe to place inside the single-quoted href attribute
    built below. "&" is replaced first so that the entities produced by
    the later replacements are not escaped a second time.
    """
    for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"),
                        ('"', "&quot;"), ("'", "&#39;")):
        text = text.replace(raw, entity)
    return text


def _resolveShortURL(URL):
    """Return the Location header the shortening service sends for URL.

    Returns None when the response carries no Location header or when the
    request fails (socket error or malformed HTTP response), so that one
    dead link does not abort the whole run.
    """
    o = urlparse.urlparse(URL)
    # Send the query string too: the URL pattern accepts "?", "&" and "=",
    # and an empty path would produce an invalid request line.
    target = o.path or "/"
    if o.query:
        target = target + "?" + o.query
    conn = httplib.HTTPConnection(o.netloc)
    try:
        conn.request("GET", target)
        return conn.getresponse().getheader('Location')
    except (socket.error, httplib.HTTPException):
        return None
    finally:
        conn.close()


def _wrapMatch(match):
    """Return the <a> element that replaces one shortened-URL match."""
    URL = match.group('URL')
    realURL = _resolveShortURL(URL)
    if realURL is None:
        realURL = "could not resolve URL"
    else:
        # The Location header is supplied by a remote server, so escape it
        # before it goes into the attribute value.
        realURL = _escapeForAttribute(realURL)
    return "<a href='%s'>%s</a>" % (realURL, URL)


fh = open(infile, "r")
try:
    body = fh.readlines()
finally:
    fh.close()

for line in body:
    # A line may hold several shortened URLs. Substituting match by match
    # (rather than replacing each URL's text across the whole line) keeps
    # repeated URLs, or one URL that is a prefix of another, from being
    # rewritten inside an <a> element that was already inserted.
    # Each line still carries its own newline, so write it out unchanged.
    sys.stdout.write(URLPattern.sub(_wrapMatch, line))