Wednesday April 24, 2002

I just wanted to put this up here so that I can show what I did last night in case it looks like I'm not doing anything (I'm talking to myself in the future by the way. I will forget how busy I was.)

This script, txt2link.py, is used by my squidparser to make links out of all those email addresses and urls in the squid events. Now it's new, it's improved, it's da da dum....SUPERUSEFULL!!!!

import re

""" this was adapted from vagueurl.py -- regular expression to match
    informal URLs in plain text.

    This doesn't do an exact job (it doesn't parse the complete syntax
    of URLs) but it should find URLs that at least start right.
    It looks for the start of an address then gobbles up as many legal URL
    characters as it can find.
    originally by Glyn Webster <glyn@ninz.org.nz> 1999-04-27
    now by dave primmer http://primco.org
"""

# Verbose-mode regular expression source for spotting informal URLs and
# email addresses in plain text.  linktext() compiles it with
# re.IGNORECASE | re.VERBOSE, so the whitespace and "#" comments inside the
# literal (outside of character classes) are ignored by the regex engine.
#
# Shape of a match: one of the recognised prefixes, then zero or more
# URL-legal characters, then one final URL-legal character that is not
# trailing punctuation -- so at least one character must follow the prefix.
#
# NOTE: the email prefix only accepts word characters, "-" and "%", so for
# an address such as "first.last@host" the match starts at "last@host".
pattern = r'''
  ( ( \w | - | % )+ @  #  email address prefix (e.g. "glyn@")
  | \w+ ://            #  or protocol prefix (e.g. "http://")
  | news:              #  or "news:" prefix (special case: no "//")
  | mailto:            #
  | www \.             #  lazy typists leave off common prefixes
  | ftp \.
  )                    #  then
  [^\\{}|[\]^<>"'\s]*  #  the rest are any characters allowed in a URL.
  [^\\{}|[\]^<>"'\s.,;?:!]
                       #  it mustn't end in a punctuation mark or this would
                       #  match this wrong: "Www.w3.org, ftp.simtel.net."
'''


def vagueurl(url):
    """ Add an appropriate scheme prefix to an informal URL.

        url is a string (callers holding a match object pass its
        group(0) text).

        Returns url unchanged when it already starts with a "scheme:"
        prefix.  Otherwise returns it with "mailto:" prepended if it starts
        like an email address, "ftp://" if it starts with "ftp" in any
        letter case, and "http://" in every other case.
    """
    if re.match(r"\w+:", url):           # Has prefix already, leave it alone.
        return url
    # Email local parts commonly contain "." and "+" ("first.last@host",
    # "user+tag@host"), so accept them alongside word chars, "-" and "%".
    if re.match(r"[\w.%+-]+@", url):     # Starts like an email address.
        return "mailto:" + url
    # The link pattern is applied case-insensitively, so "FTP.simtel.net"
    # can reach this point; compare the prefix without regard to case.
    if url[:3].lower() == 'ftp':         # Starts like an FTP address.
        return "ftp://" + url
    return "http://" + url               # Assume it's a WWW address.

def URL2htmllink(url):
    """ Render a matched informal URL as an HTML anchor element.

        url is a match object (as handed to a re.sub callback).  Its whole
        matched text is used as the link text, and vagueurl() of that text
        is used as the href.
    """
    matched = url.group(0)
    target = vagueurl(matched)
    return '<a href="%s">%s</a>' % (target, matched)

def URL2xmllink(url):
    """ Render a matched informal URL as a <link> XML fragment.

        url is a match object (as handed to a re.sub callback).  Its whole
        matched text goes into <text>, and vagueurl() of that text goes
        into <address>.
    """
    matched = url.group(0)
    address = vagueurl(matched)
    return '<link><address>%s</address><text>%s</text></link>' % (address, matched)

def linktext(text,mode='html'):
    """ Replace every informal URL or email address in text with link markup.

        mode 'xml' wraps matches with URL2xmllink; any other value wraps
        them with URL2htmllink.  Returns the rewritten text.
    """
    finder = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
    if mode == 'xml':
        make_link = URL2xmllink
    else:
        make_link = URL2htmllink
    return finder.sub(make_link, text)

if __name__ == '__main__':
  # Smoke test: run the default (HTML) linker over a sample line that mixes
  # a full URL, a bare "www." address, an email address and plain words.
  sampletext = """dsik sdksils difjsk http://primco.org akd is sss.sss.www
 www.ddd another and nobody@nowhere.com aid """
  # Parenthesised single-argument print is valid as both the Python 2
  # print statement and the Python 3 print function.
  print(linktext(sampletext))