I just wanted to put this up here so that I can show what I did last night in case it looks like I’m not doing anything (I’m talking to myself in the future by the way. I will forget how busy I was.)
This script, txt2link.py, is used by my squidparser to make links out of all those email addresses and URLs in the squid events. Now it’s new, it’s improved, it’s da da dum…SUPERUSEFUL!!!!
import re
""" this was adapted from vagueurl.py -- regular expression to match
informal URLs in plain text.
This doesn't do an exact job (it doesn't parse the complete syntax
of URLs) but it should find URLs that at least start right.
It looks for the start of an address then gobbles up as many legal URL
characters as it can find.
originally by Glyn Webster 1999-04-27
now by dave primmer /
"""
# Verbose-mode regular expression source for informal URLs and email
# addresses. linktext() compiles it with re.IGNORECASE | re.VERBOSE, so the
# whitespace and the '#' comments inside the literal are ignored by the
# regex engine (whitespace inside a [...] class is still literal).
# Shape of a match: one of the recognised prefixes, then zero or more
# URL characters, then one final URL character that is not punctuation.
pattern = r'''
( ( \w | - | % )+ @ # email address prefix (e.g. "glyn@")
| \w+ :// # or protocol prefix (e.g. "http://")
| news: # or "news:" prefix (special case: no "//")
| mailto: #
| www \. # lazy typists leave off common prefixes
| ftp \.
) # then
[^\\{}|[\]^<>"'\s]* # the rest are any characters allowed in a URL.
[^\\{}|[\]^<>"'\s.,;?:!]
# it mustn't end in a punctuation mark or this would
# match this wrong: "Www.w3.org, ftp.simtel.net."
'''
def vagueurl(url):
    """Add an appropriate scheme prefix to an informal URL.

    url -- the informal URL as a string, or a regex match object, in
           which case the whole matched text (group 0) is used.

    Returns the URL as a string:
      * unchanged if it already starts with a "word:" prefix,
      * with "mailto:" prepended if it starts like an email address,
      * with "ftp://" prepended if it starts with "ftp" (any letter case),
      * with "http://" prepended otherwise.
    """
    # Accept a match object as the docstring always promised.
    if hasattr(url, "group"):
        url = url.group(0)

    if re.match(r"\w+:", url):  # Has prefix already, leave it alone.
        return url
    if re.match(r"(\w|-|%)+@", url):  # Starts like an email address.
        return "mailto:" + url
    # Compare case-insensitively: the URL pattern is matched with
    # re.IGNORECASE, so "FTP.example.org" reaches this point too.
    if url[:3].lower() == "ftp":  # Starts like an FTP address.
        return "ftp://" + url
    return "http://" + url  # Assume it's a WWW address.
def URL2htmllink(url):
    """Wrap an HTML anchor around a matched URL.

    url -- a regex match object; the whole matched text (group 0) is used
           both as the link text and, after vagueurl() has added a scheme
           prefix, as the href target.

    Returns the anchor markup as a string. No HTML escaping is applied to
    the matched text.
    """
    matched = url.group(0)
    # Two values need two placeholders; the previous single-'%s' template
    # raised TypeError ("not all arguments converted") on every call.
    return '<a href="%s">%s</a>' % (vagueurl(matched), matched)
def URL2xmllink(url):
    """Build the XML-mode replacement text for a matched URL.

    url -- a regex match object; the whole matched text (group 0) is used.

    Returns the prefixed form from vagueurl(), immediately followed by the
    matched text and one trailing space.
    """
    matched = url.group(0)
    target = vagueurl(matched)
    # NOTE(review): the template holds no XML tags, so the result is just
    # the two values run together. The markup may have been lost from the
    # literal -- confirm the intended element names before relying on it.
    template = '%s%s '
    return template % (target, matched)
def linktext(text,mode='html'):
    """Replace every URL or email address found in text with link markup.

    text -- the string to scan.
    mode -- 'xml' selects URL2xmllink as the replacement callback; any
            other value selects URL2htmllink.

    Returns text with each match of the module-level pattern substituted
    by the callback's result.
    """
    matcher = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
    if mode == 'xml':
        make_link = URL2xmllink
    else:
        make_link = URL2htmllink
    return matcher.sub(make_link, text)
if __name__ == '__main__':
    # Manual smoke test: run the default (HTML) mode over a sample that
    # contains a bare "www." address and an email address, and show the
    # result.
    sampletext = """dsik sdksils difjsk / akd is sss.sss.www
www.ddd another and nobody@nowhere.com aid """
    # Parenthesised single-argument form prints the same on Python 2 and
    # is valid syntax on Python 3, unlike the bare print statement.
    print(linktext(sampletext))