import urllib2
import re
import functions
import owners
import mocking
import httplib
import socket
import sys
import settings
import make_report
import time
import datetime
import logging
import os

"""
This is the core of the Crawler utility; it handles keeping track of pages
and URLs checked and brings all the other modules together, calling them
as appropriate.

The way in which the crawler works is:
  * it requires a starting point from the user (a valid URL)
  * it will retrieve the source of the given URL and identify all existing 
  anchor tags, excluding those enclosed in comments
  * it queries each found URL in turn, by sending a HTTP request; if an error
  occurs, it is registered as either a HTTP error (server has responded, but
  the request could not be fulfilled due to certain circumstances; the most
  common issue will be a 404 error, indicating that the requested resource 
  (in most cases, a web page) could not be found on the server) or a URL error
  (the server could not be contacted, which does not necessarily indicate an
  invalid link, but one that might require a more particular request).
  * it builds the list of next pages to crawl by taking all the URLs that 
  have been checked, removing those that are the same as pages that have been
  already checked, and also removing those that are not children of the 
  initially given URL
  * it recursively does the previous 3 steps, iterating through the list built
  at the previous step; what this means is that the program runs after a
  breadth-first search algorithm
  * finally, with the settings offered in the owner configuration file, it 
  sends out appropriate e-mails to inform interested parties of the broken 
  links identified, alongside any information deemed useful in fixing the 
  issues
"""

#---------------------------------------------------------------------------#
# OUTPUT REDIRECTION
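# stdout and stderr are redirected to log files so that the crawl can run
# unattended without losing its console feedback.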

if not os.path.isdir(settings.LOG_FOLDER):
  os.mkdir(settings.LOG_FOLDER)
sys.stdout = open(settings.LOG_FILE, "w")
sys.stderr = open(settings.LOG_ERROR_FILE, "w")

#---------------------------------------------------------------------------#
# ARGUMENT PARSING
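# Example invocation (the script name below is illustrative; see
# executeCrawl.sh):
#   python crawler.py http://www.example.com/ 1    # report only 404 errors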

url = sys.argv[1]                       # the URL to start checking from

settings.FEEDBACK_MODE = int(sys.argv[2]) 
                                        # the feedback detail level:
                                        #   * 1 - only 404 HTTP errors
                                        #   * 2 - only HTTP errors
                                        #   * 3 - all errors
                                        # [can be modified in settings.py]
                                        # [more info in executeCrawl.sh]
                                        
#---------------------------------------------------------------------------#
# VARIABLE INITIALISATION

checked_urls = {}                       # contains the URLs checked and 
                                        # whether the URL is valid or not
                                        
checked_pages = set()                   # contains the pages checked

to_check_pages = set()                  # contains all pages to be checked
                                        # in the next level iteration
                                        
page_errs = {}                          # pages that were marked as to be 
                                        # checked, but no connection could be
                                        # made

error_links = {}                        # contains the pages which contain 
                                        # broken links, as well as some error
                                        # information regarding why the links
                                        # are broken 

url_pos = {}                            # contains which URLs are located at
                                        # which positions in the sources
                                        
sources = {}                            # contains all pages with their 
                                        # respective sources for reuse

level = 0
owns = owners.gen_owners(settings.OWNER_CONFIG_FILENAME)
safe = functions.read_safe_links(settings.SAFE_LINK_FILENAME)
extra = functions.read_extra_prefixes(settings.EXTRA_PREF_FILENAME)
wfile = open(settings.FEEDBACK_WRITE_FILENAME, "w")
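# owns:  owner configuration, used to group the broken-link reports per owner
# safe:  trusted link patterns that the crawler skips instead of re-checking
# extra: additional URL prefixes treated as part of the valid crawl region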

# Automatically fix a trivial problem with the starting URL (missing scheme)
if not url.startswith("http://"):
  url = "http://" + url

# Initial request to obtain correct URL
init_req = urllib2.Request(url, headers={"Accept" : "*/*"})
try:
  init_sock = urllib2.urlopen(init_req, timeout = 20)
except (urllib2.URLError, socket.error, socket.timeout, ValueError) as e:
  print "Error opening initial URL: %s." % (e)
  sys.exit()
init_url = init_sock.geturl()
init_sock.close()
to_check_pages.add(init_url)
checked_urls[init_url] = "OK"

# Initialising logging
logging.basicConfig(filename = settings.PAGE_LOG,
                    format = "[%(asctime)s] %(message)s",
                    datefmt = "%H:%M:%S", level = logging.DEBUG)

print("Initialisation complete.")
start_time = time.time()

#---------------------------------------------------------------------------#
# CRAWLER CORE
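# The crawl proceeds level by level (breadth-first search); a negative
# MAX_LEVEL disables the depth limit.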

while len(to_check_pages) != 0 and (level <= settings.MAX_LEVEL or settings.MAX_LEVEL < 0):
  print "[%s] Currently checking level %s. %s pages to investigate." % (time.strftime("%H:%M:%S // %a %d"), level, len(to_check_pages))
  curr_page = 0
  logging.info("--Level %s start--" % (level))
  for url in to_check_pages:            
    
    # Get the current checked page's HTML source
    curr_page += 1
    iniurl = url
    req = urllib2.Request(url, headers={"Accept" : "*/*"})
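    # Fetch the page; if the request fails, the page is still marked as
    # checked and the cause recorded in page_errs, so it is reported once
    # but never retried.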
    try:
      usock = urllib2.urlopen(req, timeout = 120)
    except urllib2.URLError as e:
      print "Error connecting to %s: %s" % (url, e.reason)
      checked_pages.add(url)
      page_errs[url] = "Connection error."
      logging.debug("Error for %s -- %s." % (url, e.reason))
      continue
    except socket.error as socke:
      print "Socket error: %s." % (socke)
      checked_pages.add(url)
      page_errs[url] = "Connection error."
      logging.debug("Error for %s -- %s." % (url, socke))
      continue
    except socket.timeout as stoe:
      print "Connection timeout."
      checked_pages.add(url)
      page_errs[url] = "Connection timeout."
      logging.debug("Error for %s -- %s." % (url, stoe))
      continue
    except ValueError as verr:
      print "Invalid URL provided: %s" % verr
      checked_pages.add(url)
      page_errs[url] = "Invalid URL."
      logging.debug("Error for %s -- %s." % (url, verr))
      continue
    source = usock.read().lower()
    url = usock.geturl()
    usock.close()

    if '#' in url:
      url = url.split('#')[0]
      print "Stripping fragment from URL..."
    # Double check if real URL (for redirected links) not already checked
    if url in checked_pages:
      print "[%s / %s / %s] Page at %s previously checked, skipping..." % (level, curr_page, len(to_check_pages), url)
      print "[%s / %s / %s] [ %s ]" % (level, curr_page, len(to_check_pages), iniurl)
      logging.debug("Skipped %s -- %s." % (url, iniurl))
      if iniurl != url:
        checked_pages.add(iniurl)
      continue
    # Double check if real URL is still valid
    if not url.startswith(init_url) and not [expref for expref in extra if expref in url]: 
      print "[%s / %s / %s] Page at %s reported URL outside of valid checking region, skipping..." % (level, curr_page, len(to_check_pages), url)
      print "[%s / %s / %s] [ %s ]" % (level, curr_page, len(to_check_pages), iniurl)
      logging.debug("Skipped %s -- %s." % (url, iniurl))
      checked_pages.add(url)
      if iniurl != url:
        checked_pages.add(iniurl)
      continue
    # Double check if real URL is safe
    is_safe = False
    for spage in safe:
      if spage in url:
        logging.debug("Skipped safe page %s -- %s." % (url, iniurl))
        is_safe = True
        break
    if is_safe:
      checked_pages.add(url)
      continue

    # Checks succeeded; clean the source of comments and start the check process
    print "*" * 20
    print "[%s/%s] Checking page at %s..." % (curr_page, len(to_check_pages), url)
    print "[%s/%s] [ %s ]" % (curr_page, len(to_check_pages), iniurl)
    logging.debug("Initiating %s check." % (url))
    source = functions.clear_source(source)
    if source is False:
      print "Could not parse comments due to improper syntax!"
      checked_pages.add(url)
      page_errs[url] = "Error parsing comments in source."
      continue
    sources[url] = source
    
    
    # Find the URLs contained in the checked page and check them
    anchors = functions.find_anchors(source)
    frames = functions.find_frames(source)
    urls, u_pos = functions.get_urls(anchors, frames, source)
    base = functions.find_base(source)
    if not base:
      base = url
    else:
      print "Found base URL %s." % (base)
    urls, url_pos[url] = functions.adopt_orphans(urls, u_pos, base)
    errs = functions.check_urls(urls, url, checked_urls, safe)
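    # check_urls returns (HTTP errors, URL errors, ignored URLs, redirects);
    # each URL found on this page is given a status below based on which of
    # these groups (if any) it fell into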
      
      
    for chkurl in set(urls.keys()).difference(errs[2]):     #exclude ignored URLs
      if chkurl in errs[0].keys():
        checked_urls[chkurl] = "HTTP", errs[0][chkurl]
      elif chkurl in errs[1].keys():
        checked_urls[chkurl] = "URL", errs[1][chkurl]
      elif chkurl in errs[3].keys():
        checked_urls[chkurl] = "REDIRECT", errs[3][chkurl]
      else:
        checked_urls[chkurl] = "OK"
    
    #for chkurl in set(urls.keys()):
      #checked_urls[chkurl] = "OK"
    
    error_links[url] = errs[0], errs[1], url_pos[url]
    checked_pages.add(url)
    functions.record_page(error_links[url], source, url, wfile, urls)

  to_check_pages = set(checked_urls.keys())
  #for errlnks in error_links.get(url)[0], error_links.get(url)[1]:
    #to_check_pages = to_check_pages.difference(errlnks.keys())
  
  # Parse the checked URLs and select the pages to be checked at the next
  # level: direct children of the starting URL, plus any URL matching one of
  # the extra prefixes
  direct_pages = set([page for page in to_check_pages if page.startswith(init_url)])
  if len(extra) != 0:
    extra_pages = set()
    for exlink in set(checked_urls.keys()).difference(direct_pages):
      for prefix in extra:
        if prefix in exlink:
          extra_pages.add(exlink)
    to_check_pages = direct_pages.union(extra_pages)
  else:
    to_check_pages = direct_pages
  to_check_pages = to_check_pages.difference(set(k for k,v in checked_urls.items() if v != "OK"))
  to_check_pages = to_check_pages.difference(checked_pages)
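  # Finally, drop pages whose extension does not look like an HTML/PHP
  # document, as well as any page matching a safe-link pattern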
  to_rem = set()
  for qpage in to_check_pages:
    ext = qpage[qpage.rfind("/"):]
    if "." in ext:
      ext = ext[ext.find("."):]
      if ext != ".html" or ext != ".htm" or ext != ".php":
        to_rem.add(qpage)
    for spage in safe:
      if spage in qpage:
        to_rem.add(qpage)
  to_check_pages = to_check_pages.difference(to_rem)
  logging.debug("--Finished level %s--" % (level))
  level += 1

parse_errs = functions.parse_errors(error_links) 
no_bklinks = sum([len(errs[0])+len(errs[1]) for key,errs in parse_errs.iteritems()])
wfile.write("Found %s pages containing %s broken links total." % (len(parse_errs), no_bklinks))
wfile.close()
  
#---------------------------------------------------------------------------#
# CONSOLE FEEDBACK RETURN
  
print "*" * 40
for (url, errs) in error_links.iteritems():
  print "Page URL = %s" % (url)
  print "-" * 20
  for (httpurl, httperr) in errs[0].iteritems():
    print "HTTP broken link = %s" % (httpurl)
    print "HTTP status code = %s" % (httperr)
    print "-" * 20
  for (urlurl, urlerr) in errs[1].iteritems():
    print "URL broken link = %s" % (urlurl)
    print "URL reason = %s" % (urlerr)
    print "-" * 20
  print "*" * 20
if len(page_errs) > 0:
  print "Pages that were not checked due to errors connecting to them:"
  for (page_err, cause) in page_errs.items():
    print "\t%s: %s" % (page_err, cause)
print "*" * 40

location = mocking.gen_mocks(functions.parse_errors(error_links), sources, owns, init_url)
#owners.check_owners(error_links, owns)

# Make the overall report as an HTML document
make_report.create_dir()
make_report.writeHTMLDoc(functions.parse_errors(error_links), location) 
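# Also write one report per owner, containing only that owner's broken links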
for uname, err_links in owners.get_owner(functions.parse_errors(error_links), owns).iteritems():
  make_report.writeHTMLDoc(err_links, location, uname)

end_time = time.time()
print "Crawler job succesfully checked %s pages in %s." % (len(checked_pages), str(datetime.timedelta(seconds=end_time-start_time)))


#---------------------------------------------------------------------------#

