#!/usr/local/bin/python

import sys, os, getopt, time, string, socket
import urlparse, urllib
import neo_cgi, neo_util
import textindex

# Make the locally-installed helper modules (sgmllib shim, bsddb3) importable.
sys.path.insert(0, "/home/www/blong/programs/python")

import sgmllib
from bsddb3 import dbshelve

# Berkeley-DB shelve used to cache fetched page titles across runs.
TITLE_CACHE = "/home/www/blong/titles.db"

class TitleExtractor(sgmllib.SGMLParser):
  """SGML parser that accumulates the text between <title> and </title>."""

  def __init__ (self):
    sgmllib.SGMLParser.__init__ (self, 0)
    self._in_title = 0
    self._title = ""

  def start_title(self, attrs):
    # Entering the <title> element: begin collecting character data.
    self._in_title = 1

  def end_title(self):
    # Left the <title> element: stop collecting.
    self._in_title = 0

  def handle_data(self, data):
    if not self._in_title:
      return
    self._title = self._title + data

  def title(self):
    """Return the collected title text ('' if no <title> was seen)."""
    return self._title

class MyURLopener(urllib.FancyURLopener):
  """URL opener with a custom User-Agent that never prompts for passwords."""

  def __init__(self, *args):
    # Advertise a browser-like agent string instead of the urllib default.
    self.version = "Mozilla/5.0 (references extractor)"
    urllib.FancyURLopener.__init__(self, *args)

  def prompt_user_passwd(self, host, realm):
    # On a basic-auth challenge, decline rather than block on stdin.
    return None, None

# Install our opener as the module-wide default so urllib.urlopen() uses it.
urllib._urlopener = MyURLopener()


class GenerateReferences:
  def __init__ (self, prefix_match, local):
    self.prefix_match = prefix_match
    self.local = local
    # self.match_re = re.compile(matching)
    
    self.refs = {}
    self.queries = []

  def process_file(self, logfile):
    fp = open(logfile)
    sys.stderr.write("Processing file %s\n" % logfile)

    while 1:
      line = fp.readline()
      if not line: break
      parts = string.split(line, ' ')
      if len(parts) < 11: continue
      url = parts[6]
      ref = parts[10]

      if ref[0] == '"':
        ref = ref[1:-1]

      if not ref or ref == "-": continue
      if ref.find("wholesale4.com") != -1: continue
      if ref.find("precisionintelligence.com") != -1: continue
      if ref.find("google") != -1 and ref.find("cache:") != -1: continue
      if ref.find("corp.google") != -1: continue
      if ref.find("casino-game-internet-casino-on-line") != -1: continue

      if self.local:
        (proto, host, path, params, qstr, frag) = urlparse.urlparse(ref)
        if host.find(self.local) != -1: continue

      # Ignore links to my bookmarklet script
      if string.find(url, "blinks.cgi") != -1:
        continue

      if string.find(url, self.prefix_match) == 0:
        # sys.stderr.write("Matching %s\n" % url)
        (who, query) = self.determine_query(ref)
        if query: query = query.strip()
        if query:
          self.queries.append( (who, query, ref) )
        else:
          try:
            count = self.refs[ref]
          except KeyError:
            count = 0
          count = count + 1
          self.refs[ref] = count

  def qdict_from_url(self, url):
    (proto, host, path, params, qstr, frag) = urlparse.urlparse(url)
    if not qstr: return {}
    parts = string.split(qstr, '&')
    qdict = {}
    for part in parts:
      sub_parts = string.split(part, '=', 1)
      if len(sub_parts) == 2:
        qdict[neo_cgi.urlUnescape(sub_parts[0])] = neo_cgi.urlUnescape(sub_parts[1])
    return qdict

  def determine_query(self, url):
    (proto, host, path, params, qstr, frag) = urlparse.urlparse(url)
    x = string.find(host, ':')
    if x!=-1: 
      host = host[:x]
    host = host.lower()
    if not qstr: return None, None
    parts = string.split(qstr, '&')
    qdict = {}
    for part in parts:
      sub_parts = string.split(part, '=', 1)
      if len(sub_parts) == 2:
        qdict[neo_cgi.urlUnescape(sub_parts[0])] = neo_cgi.urlUnescape(sub_parts[1])
        
    if string.find(host, 'google') != -1 and path in ('/search', '/custom', '/ie', '/hws/search', '/microsoft', '/imgres'):
      query = qdict.get('q', '') 
      if not query:
        query = qdict.get('as_q', '') 
      if not query:
        query = qdict.get('as_oq', '') 
      if not query:
        query = qdict.get('as_epq', '') 
      if not query:
        prev = qdict.get('prev', '')
        qdict = self.qdict_from_url(prev)
        query = qdict.get('q', '') 
        if not query:
          query = qdict.get('as_q', '') 
        if not query:
          query = qdict.get('as_oq', '') 
        if not query:
          query = qdict.get('as_epq', '') 
      return "Google", query
    elif host[-16:] == 'search.yahoo.com' or host in ['websearch.yahoo.co.jp', 'search.yahoo.co.jp']:
      query = qdict.get('p')
      if not query:
        query = qdict.get('va', '')
      if not query:
        query = qdict.get('vp', '')
      return "Yahoo", query
    elif host.find('altavista.com') != -1:
      query =  qdict.get('q', '')
      if not query:
        query = qdict.get('aqa', '')
      if not query:
        query = qdict.get('aqb', '')
      return "Altavista", query
    elif host in ['mysearch.myway.com', 'kd.mysearch.myway.com', 'www.mywebsearch.com']:
      query = qdict.get('searchfor', '')
      return "MyWay", query
    elif host.find('search.msn.') != -1:
      query = qdict.get('q', '')
      if not query:
        query = qdict.get('MT', '')
      return "MSN", query
    elif host in ['search.lycos.com']:
      return "Lycos", qdict.get('query', '')
    elif host in ['www.alltheweb.com']:
      return "AllTheWeb", qdict.get('q', '')
    elif host in ['s.teoma.com']:
      return "Teoma", qdict.get('q', '')
    elif host in ['search.naver.co.jp', 'search.naver.com']:
      return "Naver", qdict.get('query', '')
    elif host == 'cgi.search.biglobe.ne.jp':
      return "BigGlobe", qdict.get('q', '')
    elif host.find('aolsearch') != -1 or host in ['find.web.aol.com', 'search.aol.com']:
      query = qdict.get('query', '')
      if not query:
        query = qdict.get('userQuery', '')
      return "AOL", query
    elif host == 'search.jp.aol.com':
      return "AOL/JP", qdict.get('query', '')
    elif path.find("dog/results") != -1:
      return "Dogpile", qdict.get('qkw', '')
    elif host == 'www.worldnet.att.net':
      return "ATT", qdict.get('qry', '')
    elif host in ['web.ask.com', 'www.ask.co.uk', 'tm.wc.ask.com', 'pictures.ask.com']:
      bpg = qdict.get('bpg', '')
      if bpg:
        qdict = self.qdict_from_url(bpg)
      query = qdict.get('q', '')
      if not query:
        query = qdict.get('ask', '')
      return "AskJeeves", qdict.get('q', '')
    elif host == 'search.netscape.com':
      return "Netscape", qdict.get('query', '')
    elif host == 'srch.lop.com':
      return 'SearchWebNow', qdict.get('s', '')
    elif host.find('mysearch.com') != -1:
      return 'MySearch', qdict.get('searchfor', '')
    elif host == 'search.active-max.com':
      return 'ActiveMax', qdict.get('s', '')
    elif host == 'www.overture.com':
      return 'Overture', qdict.get('Keywords', '')
    elif host == 'www.comcast.net' and path == '/qry/websearch':
      return 'Comcast', qdict.get('query', '')
    elif host in ['web.resultsondemand.net', 'web.yoursearchfinder.com']:
      return 'Quick!', qdict.get('s', '')
    elif host == 'search.cometsystems.com':
      return 'Starware', qdict.get('qry', '')
    elif host == 'ixquick.com':
      return 'Ixquick', qdict.get('query', '')
    elif host == 'search.earthlink.net':
      return 'Earthlink', qdict.get('q', '')
    elif host == 'www.picsearch.com':
      return 'Picsearch', qdict.get('q', '')
    elif host == 'www.excite.co.jp':
      return 'Excite', qdict.get('search','')
    elif host == 'search.freeserve.com':
      return 'Freeserve', qdict.get('q','')
    elif host in ['www.looksmart.com', 'rr.looksmart.com']:
      return 'Looksmart', qdict.get('key','')
    elif host == 'inazuma':
      return 'Inazuma', qdict.get('QUERY', '')
    elif host == 'asp.usatoday.com':
      return 'USA Today', qdict.get('q', '')
    elif host == 'drs.yahoo.com':
      parts = path.split('/')
      query = ''
      for part in parts:
        sub = part.split('=', 1)
        if len(sub) != 2: continue
        if sub[0] == 'K':
          query = sub[1]
      return 'Yahoo!', query
    elif host == 'www.searchalot.com':
      return 'SearchALot', qdict.get('q', '')
    elif host == 'freshmeat.net':
      return 'Freshmeat', qdict.get('q', '')
    elif host == 'results.searchscout.net':
      return 'SearchScout', qdict.get('k', '')
    elif host == 'hoststart.oingo.com':
      return 'DomainPark', qdict.get('Keywords', '')
    elif host == 'sitefinder.verisign.com':
      return 'SiteFinder', qdict.get('host', '')
    
    return None, None

  def fetch_title(self, url):
    db = dbshelve.open(TITLE_CACHE)
    try:
      try:
        title = db[url]
        if title == '403 Forbidden': raise KeyError
        return title
      except KeyError:
        pass

      (proto, host, path, params, qstr, frag) = urlparse.urlparse(url)

      if host[:14] == "groups.google.":
        return "Google Groups"

      if proto == "http":
        # strip fraq part
        sys.stderr.write("Fetching %s\n" % url)
        try:
          fp = urllib.urlopen(urlparse.urlunparse((proto, host, path, params, qstr, '')))
          page = fp.read()
          parser = TitleExtractor()
          parser.feed(page)
          parser.close()
          title = parser.title()
          if title:
            if title.find(chr(0x27) + "$B") != -1:
              title = host
          if not title: title = host
          if title == '403 Forbidden': title = host
          db[url] = title
          return title
        except IOError:
          sys.stderr.write("Unable to fetch url: %s\n" % url)
        
      return host
    finally:
      db.close()

  def output(self, outfile):
    hdf = neo_util.HDF()
    hdf.setValue('AdsEnabled', '0')
    items = self.refs.items()
    items = map(lambda x: (x[1], x[0]), items)
    items.sort()
    items.reverse()
    x = 0
    for count, url in items:
      title = self.fetch_title(url)
      if title == '403 Forbidden': continue
      hdf.setValue("References.%d.url" % x, url)
      hdf.setValue("References.%d.count" % x, str(count))
      if title:
        hdf.setValue("References.%d.title" % x, neo_cgi.htmlEscape(title))
      x = x + 1

    queries = self.queries
    queries.reverse()
    query_count = {}
    popular_query = {}
    popular_words = {}
    popular_engine = {}
    for (who, query, url) in queries:
      query = query.lower()
      uniq = who + query
      query_count[uniq] = query_count.get(uniq, 0) + 1
      popular_engine[who] = popular_engine.get(who, 0) + 1
      popular_query[query] = popular_query.get(query, 0) + 1
      words = textindex.count_words(query)
      for word in words.keys():
        popular_words[word] = popular_words.get(word, 0) + 1
    x = 0
    query_seen = {}
    for (who, query, url) in queries:
      uniq = who + query.lower()
      if query_seen.has_key(uniq): continue
      query_seen[uniq] = 1
      hdf.setValue("Queries.%d.who" % x, neo_cgi.htmlEscape(who))
      hdf.setValue("Queries.%d.query" % x, neo_cgi.htmlEscape(query))
      hdf.setValue("Queries.%d.url" % x, url)
      hdf.setValue("Queries.%d.count" % x, str(query_count.get(uniq, 0)))
      x = x + 1

    popular_engine = map(lambda x: (x[1], x[0]), popular_engine.items())
    popular_engine.sort()
    popular_engine.reverse()
    
    x = 0
    for (count, engine) in popular_engine:
      hdf.setValue("QueryEngine.%d.who" % x, neo_cgi.htmlEscape(engine))
      hdf.setValue("QueryEngine.%d.count" % x, str(count))
      x = x + 1

    popular_query = map(lambda x: (x[1], x[0]), popular_query.items())
    popular_query.sort()
    popular_query.reverse()
    
    x = 0
    for (count, query) in popular_query:
      hdf.setValue("QueryPop.%d.query" % x, neo_cgi.htmlEscape(query))
      hdf.setValue("QueryPop.%d.count" % x, str(count))
      x = x + 1

    popular_words = map(lambda x: (x[1], x[0]), popular_words.items())
    popular_words.sort()
    popular_words.reverse()
    
    x = 0
    for (count, word) in popular_words:
      hdf.setValue("QueryWords.%d.word" % x, neo_cgi.htmlEscape(word))
      hdf.setValue("QueryWords.%d.count" % x, str(count))
      x = x + 1
      
    if outfile == "-":
      print hdf.dump()
    else:
      hdf.writeFileAtomic(outfile)


def main(argv, environ):
  """Command-line driver: parse options, process the log files, emit HDF.

  Options: --matching=prefix  only count requests whose URL starts with prefix
           --outfile=path     write HDF here ('-' = stdout, the default)
           --local=domain     referrer hosts containing this are ignored
  """
  def usage(progname):
    # Defined locally: the original called a module-level usage() that
    # doesn't exist, so --help raised NameError.
    sys.stderr.write(
      "usage: %s [--matching=prefix] [--outfile=path] [--local=domain] logfile...\n"
      % progname)

  alist, args = getopt.getopt(argv[1:], "", ["help", "matching=", "outfile=", "local="])

  outfile = "-"
  matching = ""
  # Default the local-referrer filter to our own domain
  # (hostname with its first label removed).
  domain = socket.gethostname()
  parts = string.split(domain, '.')
  domain = string.join(parts[1:], '.')
  local = domain

  sys.stderr.write("Domain: %s\n" % domain)

  for (field, val) in alist:
    if field == '--help':
      usage(argv[0])
      return
    if field == "--outfile":
      outfile = val
    if field == "--matching":
      matching = val
    if field == "--local":
      local = val

  sys.stderr.write("Local: %s\n" % local)
  genref = GenerateReferences(matching, local)

  # 'logfile', not 'file': avoid shadowing the builtin.
  for logfile in args:
    genref.process_file(logfile)

  sys.stderr.write("Dumping\n")
  genref.output(outfile)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
  main (sys.argv, os.environ)
