import re
import urllib2
import socket
import ssl
import httplib
import settings
import time

"""
This module contains the functions that deal with the URLs themselves,
including the parsing of HTML sources and the actual checking of URLs. It
exists mainly to keep this more "raw" functionality separate from the
abstract workings of the crawler.
"""

def read_safe_links (filename):
  ''' Reads in links that are considered safe
  
  Given a filename, reads in links, expecting them to be newline-separated.
  Ignores any content on a line after a '#' sign, so that comments can be
  added to the file.
  '''
  safe_links = set()
  safe_file = open(filename,'r')
  comment_sign = '#'
  for line in safe_file:
    cpos = line.find(comment_sign)
    if cpos != -1:
      if cpos == 0:
        continue
      line = line[:cpos]
    safe_links.add(line.strip())
  safe_file.close()
  return safe_links
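
# A rough sketch of the expected file format (the filename and entries below
# are made up for illustration); full-line comments are skipped and anything
# after a '#' on a line is stripped:
#
#   # known-good prefixes, one per line
#   http://example.com/docs/
#   www.example.org        # checked pages matching this are skipped
#
# >>> read_safe_links("safe_links.txt")
# set(['http://example.com/docs/', 'www.example.org'])   (element order may vary)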
  
def read_extra_prefixes (filename):
  ''' Reads in link prefixes that should also be checked
  
  Given a filename, reads in prefixes, starting with either 'www' or 'http',
  such that any page found during crawling that contains one of these
  prefixes will also be included in the check, rather than skipped for not
  having the same prefix as the initial starting page.
  '''
  extra_pref = set()
  extra_file = open(filename,'r')
  comment_sign = '#'
  for line in extra_file:
    cpos = line.find(comment_sign)
    if cpos != -1:
      if cpos == 0:
        continue
      line = line[:cpos]
    line = line.strip()
    if not line.startswith("http") and not line.startswith("www"):
      print "Ignoring malformed extra prefix %s..." % (line)
      continue
    extra_pref.add(line)
  extra_file.close()
  return extra_pref

def clear_source (source):
  ''' Removes certain code from an HTML source
  
  Given an HTML source, clears out code that is of no interest, i.e.
  comments, and returns the resulting string. Returns False if an
  unterminated comment is found. Can be extended to exclude any other code
  that should not be parsed for URLs, as desired.
  '''
  while source.find("<!--") != -1:
    pos = source.find("<!--")
    epos = source.find("-->", pos)
    if epos == -1:
      return False
    source = source[:pos] + source[epos+len("-->"):]
  return source
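
# Minimal illustration of clear_source() on an invented snippet: every
# complete "<!-- ... -->" block is cut out, and an unterminated comment makes
# the function return False.
#
# >>> clear_source('<p>kept<!-- dropped --> also kept</p>')
# '<p>kept also kept</p>'
# >>> clear_source('<p>broken<!-- no end')
# False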

def find_base (source):
  "Retrieves the base tag if found"
  pos = source.find("<base")
  if pos == -1:
    return False
  else:
    epos = source.find(">", pos)
    upos = source.find("href", pos)
    if upos != -1 and upos < epos:
      upos += len("href=")
      dchar = source[upos]
      url = source[upos+1:source.find(dchar, upos+1)]
      return url
    return False
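
# Quick sketch of find_base() on a fabricated document head; the href value
# has to be quoted (single or double) for the tag to be picked up.
#
# >>> find_base('<head><base href="http://example.com/site/" target="_blank"></head>')
# 'http://example.com/site/'
# >>> find_base('<head><title>no base here</title></head>')
# False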
   
def find_anchors (source):             
  ''' Finds anchor positions in source string
  
  Given an HTML source, returns a list of positions in that source where
  anchor tags with a valid href have been identified.
  This is done simply by finding a "<a" in the source string and checking
  whether its href attribute is set. This means that certain tags which are
  not strictly anchors, but do have an href attribute defined, might also be
  identified as anchors. But as the overall purpose of the program is to
  check the validity of links, this should provide additional, if
  incidental, functionality.
  '''
  all_anchors = []
  for match in re.finditer("<a", source):
    epos = source.find(">", match.end())
    hpos = source.find("href", match.end())
    if hpos != -1 and hpos < epos:
      all_anchors.append(match.start())
  return all_anchors
  
def find_frames (source):
  ''' Finds frame positions in source string
  
  Similar functionality to the find_anchors() function, but instead searches
  for a "<frame" tag and an associated src attribute.
  '''
  all_frames = []
  for match in re.finditer("<frame", source):
    epos = source.find(">", match.end())
    if source.find("src", match.end()) < epos and source.find("src", match.end()) != -1:
      all_frames.append(match.start())
  return all_frames
  
def get_urls (anchors, frames, source):
  ''' Retrieves the URLs for given tags, being provided with their positions
  
  Given a source string and lists of anchor and frame positions, returns all
  URLs linked by those tags (i.e. the values of their "href" or "src"
  attributes), together with the position of each tag in the source.
  For each tag position given (as a string index into the source), it finds
  the next instance of the relevant attribute before a ">" symbol (meaning
  the attribute has to be set for this particular tag) and extracts the
  string contained within the two following instances of either single or
  double quotes, or up to the next space or ">" for unquoted values. Each
  URL is mapped to False if its anchor has no content, True otherwise.
  '''
  all_urls = {}
  url_pos = {}
  for a_pos in anchors:
    cpos = source.find(">", a_pos)
    bpos = source.find("href", a_pos) + len('href="')
    dchar = source[bpos-1]             # check if single or double quotes
    if not bpos < cpos:
      continue
    if dchar != "'" and dchar != '"':
      bpos -= 1
      epos = bpos
      while source[epos] != ' ' and source[epos] != '>':
        epos += 1
    else:
      epos = source.find(dchar, bpos)
    url = source[bpos:epos]
    
    if source[cpos+1:cpos+1+len("</a")] == "</a":  # anchor with no content
      all_urls[url] = False
    else:
      all_urls[url] = True
    url_pos[url] = a_pos
    
  # TODO these two could maybe be somehow made into a single block
  for f_pos in frames:
    cpos = source.find(">", f_pos)
    bpos = source.find("src", f_pos) + len('src="')
    dchar = source[bpos-1]             # check if single or double quotes
    if not bpos < cpos:
      continue
    if dchar != "'" and dchar != '"':
      bpos -= 1
      epos = bpos
      while source[epos] != ' ' and source[epos] != '>':
        epos += 1
    else:
      epos = source.find(dchar, bpos)
    url = source[bpos:epos]
    
    all_urls[url] = True
    url_pos[url] = f_pos
    
  return all_urls, url_pos
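
# Rough usage sketch for the tag-finding functions above, on an invented
# snippet. get_urls() maps each URL to True or False depending on whether the
# tag has any content (False marks an empty "<a ...></a>") and also returns
# the position of each tag in the source.
#
# >>> src = '<a href="http://example.com/a">text</a><a href="page.html"></a>'
# >>> urls, positions = get_urls(find_anchors(src), find_frames(src), src)
# >>> urls
# {'http://example.com/a': True, 'page.html': False}   (key order may vary)
# >>> positions
# {'http://example.com/a': 0, 'page.html': 39}         (key order may vary)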
  
def adopt_orphans (urls, url_pos, base):
  ''' Adds any relevant information to relative links
  
  Fixes links by adding specific information that might be trivially omitted,
  like an "http://" prefix, or prepending the parent for links beginning
  with "/".
  '''
  adopted_links = {}
  adopted_pos = {}
  parent = base[:[m.start() for m in re.finditer("/", base)][2]+1] # up to and including the third '/', i.e. scheme and host
  base_dir = base.endswith("/")      # True when the base itself is a directory
  for url in urls.keys():
    init_url = url
    orphan_pos = url_pos[url]
    if settings.FEEDBACK_MODE != settings.MODE_COMPLETE and url.find("?") != -1:
      url = url.split("?")[0]
    if not url.startswith("http"):
      if '#' in url:
        url = url.split("#")[0]
      if url.startswith("//"):
        url = "http://" + url[len("//"):]
      elif url.startswith("/"):
        url = parent + url[len("/"):]
      elif url.startswith("./"): # recursive + ../ case
        url = base + url[len("./"):]
      elif url.startswith("../"):
        base_url = base
        while url.startswith("../"):
          if base_url.endswith("/"):
            base_url = base_url[:-len("/")]
          base_url = base_url[:base_url.rfind("/")]
          url = url[len("../"):]
        if base_url.endswith("/"):
          base_url = base_url[:-len("/")]
        if url.startswith("/"):
          url = url[len("/"):]
        url = base_url + "/" + url
      elif url.startswith("www"):
        url = "http://" + url
      elif not ' ' in url.strip() and not 'mailto' in url:
        if base_dir:
          url = base + url.strip()
        else: 
          url = base[:base.rfind("/")+1] + url.strip()
    adopted_links[url] = urls[init_url]
    adopted_pos[url] = orphan_pos
  return adopted_links, adopted_pos
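
# A small, made-up example of adopt_orphans() resolving a few relative forms
# against an absolute base URL (the url_pos values are arbitrary here):
#
# >>> urls = {'/about': True, 'page2.html': True, '//cdn.example.net/lib.js': True}
# >>> pos = {'/about': 10, 'page2.html': 50, '//cdn.example.net/lib.js': 90}
# >>> links, _ = adopt_orphans(urls, pos, "http://example.com/docs/index.html")
# >>> sorted(links.keys())
# ['http://cdn.example.net/lib.js', 'http://example.com/about', 'http://example.com/docs/page2.html']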
  
def check_one_url (url, newurls, httperrs, urlerrs, record):
  ''' Sends a request and stores relevant information
  
  Given a URL and feedback parameters (dictionaries to store feedback and a
  boolean selecting whether errors are recorded, for the case of cursory URL
  checking), builds and issues a request for the URL, then updates the given
  dictionaries with any relevant information returned in the response.
  Returns True if the URL could be opened, False otherwise.
  '''
  req = urllib2.Request(url, headers={"Accept" : "*/*"})
  try:
    usock = urllib2.urlopen(req, timeout = 30)
    returl = usock.geturl()
    print "HTTP status code: %s" % (usock.getcode())
    usock.close()
    if returl != url and url not in returl:
      print "Found redirected link %s..." % (returl)
      newurls[returl] = url
    return True
  except urllib2.HTTPError as httperr:
    print "HTTP Error : %s : %s" % (httperr.code, httperr.reason)
    if record:
      httperrs[req.get_full_url()] = httperr.code
    return False
  except urllib2.URLError as urlerr:
    print "URL Error : %s" % (urlerr.reason)
    if record:
      urlerrs[req.get_full_url()] = urlerr.reason
    return False
  except socket.timeout as tout:
    print "URL timeout."
    if record:
      urlerrs[req.get_full_url()] = tout
    return False
  except socket.error as socke:
    print "Socket error: %s." % (socke)
    if record:
      urlerrs[req.get_full_url()] = socke
    return False
  except ssl.SSLError as ssltout:
    print "Secure URL timeout."
    if record:
      urlerrs[req.get_full_url()] = ssltout
    return False
  except ValueError as val:
    print "Invalid URL given."
    if record:
      urlerrs[req.get_full_url()] = val
    return False
  except httplib.BadStatusLine as bsle:
    print "Unknown status returned from server."
    if record:
      urlerrs[req.get_full_url()] = bsle
    return False
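
# Cursory usage sketch for check_one_url(); the URL is only a placeholder.
# The dictionaries collect redirects and errors, and passing record=False
# performs a "dry run" in which failures are printed but not stored.
#
# >>> newurls, httperrs, urlerrs = {}, {}, {}
# >>> check_one_url("http://example.com/", newurls, httperrs, urlerrs, True)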
  
def check_urls (urls, curr_url, checked, safe): 
  ''' Checks a list of URLs and records any errors
  
  Given a list of URLs and the URL of the page currently being checked,
  iterates through the list and calls check_one_url() on each of them (a
  second time on failure, for more reliability) and stores appropriate errors
  in two distinct dictionaries, one for HTTP errors, the other for URL
  errors, as well as remembering any URLs that have been ignored due to
  simple syntax checks and any redirects that were followed. Returns the two
  error dictionaries, the list of ignored URLs and the redirect aliases.
  '''
  httperrs = {}
  urlerrs = {}
  ignored = []
  newurls = {}
  alias = {}
  for url in urls.keys():
    print "-" * 20
    if url.startswith("http://") or url.startswith("https://"):
      print "[%s] Checking %s..." % (time.strftime("%H:%M:%S"), url)
    else:
      print "[%s] Ignoring %s!" % (time.strftime("%H:%M:%S"), url)
      ignored.append(url)
      continue
    is_safe = False
    for safeurl in safe:
      if safeurl in url:
        print "Safe URL found, skipping..."
        is_safe = True
        break
    if is_safe:
      continue
    if url in checked:
      if checked[url][0] == "REDIRECT":
        print "Previously idenitified redirected URL..."
        chkurl = checked[url][1]
      else:
        chkurl = url
      if checked[chkurl] == "OK":
        print "Valid URL previously identified..."
      elif checked[chkurl][0] == "HTTP":
        print "HTTP error for previously identified invalid URL. Skipping..."
        httperrs[url] = checked[chkurl][1]
      elif checked[chkurl][0] == "URL":
        print "URL error for previously identified invalid URL. Skipping..."
        urlerrs[url] = checked[chkurl][1]
    elif not check_one_url(url, newurls, httperrs, urlerrs, False):
      print "Retrying..."
      check_one_url(url, newurls, httperrs, urlerrs, True)
  print "-" * 40
  
  for (newurl, oldurl) in newurls.items():
    urls[newurl] = urls[oldurl]
    alias[oldurl] = newurl
  return httperrs, urlerrs, ignored, alias
  
def parse_errors (err_links):
  ''' Filters the per-page errors according to the selected feedback mode '''
  if settings.FEEDBACK_MODE == settings.MODE_COMPLETE:
    return err_links
  elif settings.FEEDBACK_MODE == settings.MODE_SERVER_VERIFIED:
    return {k:(v[0], {}, v[2]) for k,v in err_links.items() if len(v[0]) > 0}
  elif settings.FEEDBACK_MODE == settings.MODE_STRICT:
    strict_lnks = {}
    for (url, httperrs) in {k:v[0] for k,v in err_links.items()}.items():
      for httperr in httperrs.items():
        if httperr[1] == 404:
          if url not in strict_lnks:
            strict_lnks[url] = ({}, {}, err_links[url][2])
          strict_lnks[url][0][httperr[0]] = httperr[1]
    return strict_lnks
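
# Both parse_errors() above and record_page() below work on the same per-page
# structure: a mapping from a page URL to a (httperrs, urlerrs, positions)
# tuple. A minimal, made-up illustration of the strict filtering, where only
# 404 responses survive (assuming settings.FEEDBACK_MODE == settings.MODE_STRICT):
#
# >>> errs = {'http://example.com/page.html':
# ...         ({'http://example.com/missing': 404, 'http://example.com/old': 403},
# ...          {}, {'http://example.com/missing': 120, 'http://example.com/old': 340})}
# >>> parse_errors(errs)
# {'http://example.com/page.html': ({'http://example.com/missing': 404}, {}, {...positions unchanged...})}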
    
  
def record_page(errlnks, source, url, wfile, urls):
  ''' Records broken URL information in a file
  
  This function is to be called on one page at a time, giving its URL and
  source, a file object to write the information to, and a tuple of HTTP
  errors, URL errors and tag positions. It records all links that produced
  errors when checked and that match the mode set from the command line
  (more information on this variable can be found in crawler.py), as well as
  the corresponding anchor tag within the source, allowing for fast searching.
  '''
  
  if len(errlnks[0]) > 0 or (len(errlnks[1]) > 0 and settings.FEEDBACK_MODE == settings.MODE_COMPLETE):
    message = "On page at %s, the following broken links have been identified:\n" % (url)
    
    if len(errlnks[0]) != 0:
      message += "[Links broken due to HTTP errors]\n"
      for (httpurl, httperr) in errlnks[0].iteritems():
        if settings.FEEDBACK_MODE == settings.MODE_STRICT:
          if httperr == 404:
            message += "\t%s: status %s: %s\n" % (httpurl, httperr, httplib.responses[httperr])
            message += "\t[ " + source[errlnks[2][httpurl]:source.find(">", errlnks[2][httpurl])+1] + " ]\n"
        else:
          try:
            desc = httplib.responses[httperr]
          except KeyError:
            desc = "Unknown status retrieved"
          message += "\t%s: status %s: %s\n" % (httpurl, httperr, desc)
          message += "\t[ " + source[errlnks[2][httpurl]:source.find(">", errlnks[2][httpurl])+1] + " ]\n"
        if not urls[httpurl]:
          message += "\t[ EMPTY anchor detected ]\n"
        message += "\n"
        
    if settings.FEEDBACK_MODE == settings.MODE_COMPLETE and len(errlnks[1]) != 0:
      message += "[Links broken due to URL errors]\n"
      for (urlurl, urlerr) in errlnks[1].iteritems():
        message += "\t%s: reason %s\n" % (urlurl, urlerr)
        message += "\t[ " + source[errlnks[2][urlurl]:source.find(">", errlnks[2][urlurl])+1] + " ]\n"
        if not urls[urlurl]:
          message += "\t[ EMPTY anchor detected ]\n"
        message += "\n"
    message += "-" * 20 + "\n"
    wfile.write(message)
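
# A rough end-to-end sketch of how the helpers above fit together, kept behind
# the __main__ guard so that importing the module stays side-effect free. The
# start URL is only a placeholder and the base handling assumes an absolute
# "http..." base; the real crawl is driven by the crawler module itself.
if __name__ == "__main__":
  start = "http://example.com/"
  try:
    page = urllib2.urlopen(start, timeout = 30)
    source = page.read()
    page.close()
  except (urllib2.URLError, socket.error):
    source = ""
  source = clear_source(source)
  if source:
    base = find_base(source) or start
    urls, positions = get_urls(find_anchors(source), find_frames(source), source)
    urls, positions = adopt_orphans(urls, positions, base)
    httperrs, urlerrs, ignored, alias = check_urls(urls, start, {}, set())
    print "%d broken, %d unreachable, %d ignored" % (len(httperrs), len(urlerrs), len(ignored))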
