# functions.py -- helper routines for the link-checking web crawler.
#
# NOTE: recovered from a decompiled Python 2 .pyc. The docstrings, identifier
# names and string literals below are original; the surrounding control flow
# is a best-effort reconstruction and may differ in detail from the lost
# source. Requires Python 2.7 (dict comprehensions) and the crawler's own
# 'settings' module.

import re
import urllib2
import socket
import ssl
import httplib
import time

import settings
   Z d   Z d   Z d   Z d S(   i’’’’Nc         C   s   t    } t |  d  } d } x[ | D]S } | j |  } | d k re | d k rX q% n  | |  } n  | j | j    q% W| j   | S(   sŚ    Reads in links that are considered safe
  
  Given a filename, reads in links, expecting them to be new-line separated.
  Ignores any content on a line after a '#' sign to allow any comments that 
  might be added.
  t   rt   #i’’’’i    (   t   sett   opent   findt   addt   stript   close(   t   filenamet
   safe_linkst	   safe_filet   comment_signt   linet   cpos(    (    sH   /vol/project/2012/wmproject2013/chandra/web-project/Crawler/functions.pyt   read_safe_links   s    	
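
# Example (illustrative file name and contents): a safe-links file lists one
# URL per line; text after '#' is a comment, and a line starting with '#' is
# skipped entirely:
#
#   http://example.com/stable   # known good, never re-checked
#
#   safe = read_safe_links('safe_links.txt')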


def read_extra_prefixes(filename):
    """ Reads in link prefixes that should also be checked

    Given a filename, reads in prefixes, starting with either 'www' or 'http',
    such that any page found during crawling containing the given prefix will
    also be included in the check, rather than skipped if not having the same
    prefix as the initial starting page.
    """
    extra_pref = set()
    extra_file = open(filename, 'r')
    comment_sign = '#'
    for line in extra_file:
        cpos = line.find(comment_sign)
        if cpos != -1:
            if cpos == 0:
                continue
            line = line[:cpos]
        line = line.strip()
        if not line.startswith('http') and not line.startswith('www'):
            print 'Ignoring malformed extra prefix %s...' % line
            continue
        extra_pref.add(line)
    extra_file.close()
    return extra_pref
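
# Example (illustrative): an extra-prefixes file where only well-formed
# entries survive the 'http'/'www' check:
#
#   http://mirror.example.org/   # kept
#   ftp://example.org/           # rejected with a warning
#
#   extra = read_extra_prefixes('extra_prefixes.txt')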


def clear_source(source):
    """ Removes certain code from a HTML source

    Given a HTML source, it clears certain code that is of no interest, i.e.
    comments, and returns that string. Can be extended to exclude any code
    that should not be parsed for URLs, as desired.
    """
    while source.find('<!--') != -1:
        pos = source.find('<!--')
        epos = source.find('-->', pos)
        if epos == -1:
            # unterminated comment: signal a malformed source
            return False
        source = source[:pos] + source[epos + len('-->'):]
    return source


def find_base(source):
    """ Retrieves the base tag if found"""
    pos = source.find('<base')
    if pos == -1:
        return False
    epos = source.find('>', pos)
    upos = source.find('href', pos)
    if upos != -1 and upos < epos:
        upos += len('href=')
        dchar = source[upos]    # quote character delimiting the URL
        url = source[upos + 1:source.find(dchar, upos + 1)]
        return url
    else:
        return False
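
# Example (illustrative values):
#
#   clear_source('a<!-- hidden -->b')
#   # -> 'ab'
#   find_base('<head><base href="http://example.com/docs/"></head>')
#   # -> 'http://example.com/docs/'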


def find_anchors(source):
    """ Finds anchor positions in source string

    Given a HTML source, returns a list of positions in the given source where
    anchor tags with a valid href have been identified.
    The way in which this is done is simply finding a "<a" in the source string
    and checking whether the href attribute for it is set. This might mean that
    certain tags which are not necessarily only anchors, but have the href
    attribute defined, might be wrongly identified as anchors. But as the
    purpose of the program overall is to check validity of links, this should
    provide additional, albeit unwarranted, functionality.
    """
    all_anchors = []
    for match in re.finditer('<a', source):
        epos = source.find('>', match.end())
        if (source.find('href', match.end()) < epos and
                source.find('href', match.end()) != -1):
            all_anchors.append(match.start())
    return all_anchors


def find_frames(source):
    """ Finds frame positions in source string

    Similar functionality to the find_anchors() function, but instead searches
    for a "<frame" string and an associated src parameter for the tag
    """
    all_frames = []
    for match in re.finditer('<frame', source):
        epos = source.find('>', match.end())
        if (source.find('src', match.end()) < epos and
                source.find('src', match.end()) != -1):
            all_frames.append(match.start())
    return all_frames
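
# Example (illustrative): only tags that actually carry the attribute are
# reported, by start offset:
#
#   find_anchors('<p><a href="/x">x</a></p>')     # -> [3]
#   find_anchors('<p><a name="top">no href</a>')  # -> []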


def get_urls(anchors, frames, source):
    """ Retrieves the URLs for given tags, being provided with their positions

    Given a source string and a list of anchor positions, returns all URLs
    linked by the anchors (meaning the values of the "href" attribute).
    For all anchor positions given (as string indices for the source), it finds
    the next instance of a href attribute before a ">" symbol (meaning the href
    has to be set for this particular instance of an anchor) and returns the
    string contained within the two following instances of either single or
    double quotes.
    """
    all_urls = {}
    url_pos = {}
    for a_pos in anchors:
        epos = source.find('>', a_pos)
        bpos = source.find('href', a_pos) + len('href="')
        dchar = source[bpos - 1]
        if not bpos < epos:
            continue
        if dchar != "'" and dchar != '"':
            # unquoted attribute value: scan up to whitespace or tag end
            bpos -= 1
            pos = bpos
            while source[pos] != ' ' and source[pos] != '>':
                pos += 1
        else:
            pos = source.find(dchar, bpos)
        url = source[bpos:pos]
        if source[epos + 1:epos + 1 + len('</a')] == '</a':
            all_urls[url] = False   # anchor has no visible text
        else:
            all_urls[url] = True
        url_pos[url] = a_pos
    for f_pos in frames:
        epos = source.find('>', f_pos)
        bpos = source.find('src', f_pos) + len('src="')
        dchar = source[bpos - 1]
        if not bpos < epos:
            continue
        if dchar != "'" and dchar != '"':
            bpos -= 1
            pos = bpos
            while source[pos] != ' ' and source[pos] != '>':
                pos += 1
        else:
            pos = source.find(dchar, bpos)
        url = source[bpos:pos]
        all_urls[url] = True        # frames have no anchor text to inspect
        url_pos[url] = f_pos
    return all_urls, url_pos
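
# Example (illustrative, under this reconstruction): for
#   source = '<a href="/x"></a><a href="/y">y</a>'
# with anchors = find_anchors(source) and frames = [], get_urls returns
#   ({'/x': False, '/y': True}, {'/x': 0, '/y': 17})
# where False flags an empty anchor and the second dict maps URL -> offset.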
 t j t j k r² | j d  d k r² | j	 d  d } n  | j
 d  sd | k rć | j	 d  d } n  | j
 d	  r	d
 | t d	  } q| j
 d  r/| | t d  } q| j
 d  rU| | t d  } q| j
 d  r| } xY | j
 d  rÅ| j d  r| t d   } n  | | j d   } | t d  } qmW| j d  ré| t d   } n  | j
 d  r| t d  } n  | d | } q| j
 d  r8d
 | } qd | j   k rd | k r| ro| | j   } q| | j d  d  | j   } qn  |  |	 | | <|
 | | <q_ W| | f S(   sĪ    Adds any relevant information to relative links
  
  Fixes links by adding specific information that might be trivially omitted,
  like "http://" prefix or adding the parent for links beginning with "/"
  t   /i   i   t   ?i’’’’i    R   R   s   //s   http://s   ./s   ../R   R.   t   mailto(   R!   R"   R%   t   endswitht   keyst   settingst   FEEDBACK_MODEt   MODE_COMPLETER   t   splitR   R   t   rfindR   (   t   urlsR3   t   baset   adopted_linkst   adopted_post   mt   parentt   base_dirR   t   init_urlt
   orphan_post   base_url(    (    sH   /vol/project/2012/wmproject2013/chandra/web-project/Crawler/functions.pyt   adopt_orphansµ   sN    7
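
# Example (illustrative): with base = 'http://example.com/docs/page.html',
# parent is 'http://example.com/' and base_dir 'http://example.com/docs/':
#
#   '/about'       -> 'http://example.com/about'
#   './intro.html' -> 'http://example.com/docs/intro.html'
#   '../top.html'  -> 'http://example.com/top.html'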


def check_one_url(url, newurls, httperrs, urlerrs, record):
    """ Sends a request and stores relevant information

    Given a URL and feedback information parameters (lists to store feedback and
    a boolean to select whether or not information is stored, in case of cursory
    URL checking), builds and issues a request based on the URL, then modifies
    the given lists appropriately by adding any suitable information returned
    in the answer.
    """
    req = urllib2.Request(url, headers={'Accept': '*/*'})
    try:
        # timeout value assumed; the original constant is not legible in the
        # decompiled dump
        usock = urllib2.urlopen(req, timeout=15)
        usock.close()
        returl = usock.geturl()
        print 'HTTP status code: %s' % usock.getcode()
        if returl != url and returl not in newurls:
            print 'Found redirected link %s...' % returl
            newurls[returl] = url
        return True
    except urllib2.HTTPError, httperr:
        print 'HTTP Error : %s : %s' % (httperr.code, httperr.reason)
        if record:
            httperrs[req.get_full_url()] = httperr.code
        return False
    except urllib2.URLError, urlerr:
        print 'URL Error : %s' % urlerr.reason
        if record:
            urlerrs[req.get_full_url()] = urlerr.reason
        return False
    except socket.timeout, tout:
        print 'URL timeout.'
        if record:
            urlerrs[req.get_full_url()] = tout
        return False
    except socket.error, socke:
        print 'Socket error: %s.' % socke
        if record:
            urlerrs[req.get_full_url()] = socke
        return False
    except ssl.SSLError, ssltout:
        print 'Secure URL timeout.'
        if record:
            urlerrs[req.get_full_url()] = ssltout
        return False
    except ValueError, val:
        print 'Invalid URL given.'
        if record:
            urlerrs[req.get_full_url()] = val
        return False
    except httplib.BadStatusLine, bsle:
        print 'Unknown status returned from server.'
        if record:
            urlerrs[req.get_full_url()] = bsle
        return False
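
# Example (illustrative): a cursory pass first, then a recording retry, as
# check_urls() below does:
#
#   newurls, httperrs, urlerrs = {}, {}, {}
#   if not check_one_url('http://example.com/', newurls, httperrs, urlerrs, False):
#       check_one_url('http://example.com/', newurls, httperrs, urlerrs, True)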


def check_urls(urls, curr_url, checked, safe):
    """ Checks a list of URLs and records any errors

    Given a list of URLs and a parent URL for relative links, iterates through
    the list and calls the check_one_url function on each of them (twice, for
    more reliability) and stores appropriate errors in two distinct lists,
    one for HTTP errors, the other for URL errors, as well as remembering any
    URLs that have been ignored due to simple syntax checks. Returns the three
    lists as a tuple, along with a map of redirect aliases.
    """
    # curr_url is kept for interface compatibility; it is unused in this
    # reconstruction
    httperrs = {}
    urlerrs = {}
    ignored = []
    newurls = {}
    alias = {}
    for url in urls.keys():
        print '-' * 20
        if url.startswith('http://') or url.startswith('https://'):
            print '[%s] Checking %s...' % (time.strftime('%H:%M:%S'), url)
        else:
            print '[%s] Ignoring %s!' % (time.strftime('%H:%M:%S'), url)
            ignored.append(url)
            continue
        is_safe = False
        for safeurl in safe:
            if safeurl in url:
                print 'Safe URL found, skipping...'
                is_safe = True
                break
        if is_safe:
            continue
        if url in checked:
            # 'checked' caches earlier results as checked[url] = (status, info)
            # with status one of 'REDIRECT', 'OK', 'HTTP' or 'URL'
            # (structure reconstructed)
            if checked[url][0] == 'REDIRECT':
                print 'Previously identified redirected URL...'
                chkurl = checked[url][1]
            else:
                chkurl = url
            if checked[chkurl][0] == 'OK':
                print 'Valid URL previously identified...'
            elif checked[chkurl][0] == 'HTTP':
                print 'HTTP error for previously identified invalid URL. Skipping...'
                httperrs[url] = checked[chkurl][1]
            elif checked[chkurl][0] == 'URL':
                print 'URL error for previously identified invalid URL. Skipping...'
                urlerrs[url] = checked[chkurl][1]
        elif not check_one_url(url, newurls, httperrs, urlerrs, False):
            print 'Retrying...'
            check_one_url(url, newurls, httperrs, urlerrs, True)
    print '-' * 20
    # fold redirect targets back into the URL map, remembering the alias
    for newurl, oldurl in newurls.items():
        urls[newurl] = urls[oldurl]
        alias[newurl] = oldurl
    return httperrs, urlerrs, ignored, alias
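
# Example (illustrative): 'urls' as produced by get_urls/adopt_orphans,
# 'checked' the crawler-wide result cache, 'safe' from read_safe_links:
#
#   httperrs, urlerrs, ignored, alias = check_urls(urls, page_url, checked, safe)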


def parse_errors(err_links):
    # Filters the per-page error map according to settings.FEEDBACK_MODE.
    # err_links maps page URL -> (http_errors, url_errors, url_positions).
    if settings.FEEDBACK_MODE == settings.MODE_COMPLETE:
        return err_links
    if settings.FEEDBACK_MODE == settings.MODE_SERVER_VERIFIED:
        # keep only pages with server-confirmed (HTTP) errors, drop URL errors
        return {k: (v[0], {}, v[2]) for k, v in err_links.items()
                if len(v[0]) != 0}
    if settings.FEEDBACK_MODE == settings.MODE_STRICT:
        strict_lnks = {}
        for url, httperrs in {k: v[0] for k, v in err_links.items()}.items():
            for err in httperrs.items():
                if err[1] == 404:
                    if url not in strict_lnks:
                        strict_lnks[url] = ({}, {}, err_links[url][2])
                    strict_lnks[url][0][err[0]] = err[1]
        return strict_lnks
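
# The three feedback modes, as reconstructed: MODE_COMPLETE keeps every
# recorded error, MODE_SERVER_VERIFIED keeps only pages with HTTP
# (server-confirmed) errors, and MODE_STRICT narrows further to 404s:
#
#   err_links = {page: (httperrs, urlerrs, url_pos)}
#   to_report = parse_errors(err_links)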
 7} q{n{ y t j | } Wn t	 k
 r*d } n X| d | | | f 7} | d | |  d | | j d	 |  d |  d !d
 7} | | s| d 7} n  | d 7} qy Wn  t j t j k rgt  |  d  d k rg| d 7} x |  d j   D]z \ }	 }
 | d |	 |
 f 7} | d | |  d |	 | j d	 |  d |	  d !d
 7} | |	 sV| d 7} n  | d 7} qęWn  | d d 7} | j
 |  n  d S(   sč   Records broken URL information in a file
  


def record_page(errlnks, url, source, wfile, urls):
    """ Records broken URL information in a file

    This function is to be called on one page at a time, giving its URL and
    source, a file object where to write the information and a tuple of HTTP
    and URL errors. It records all links that issued errors when checked,
    according to the mode set from the command line (more information can be
    found in crawler.py regarding this variable), as well as the corresponding
    anchor tag within the source, allowing for fast searching.
    """
    if len(errlnks[0]) != 0 or (len(errlnks[1]) != 0 and
                                settings.FEEDBACK_MODE == settings.MODE_COMPLETE):
        message = 'On page at %s, the following broken links have been identified:\n' % url
        if len(errlnks[0]) != 0:
            message += '[Links broken due to HTTP errors]\n'
            for httpurl, httperr in errlnks[0].iteritems():
                apos = errlnks[2][httpurl]   # offset of the offending tag
                if settings.FEEDBACK_MODE == settings.MODE_STRICT:
                    # strict mode reports only 404s
                    if httperr != 404:
                        continue
                    message += '\t%s: status %s: %s\n' % (
                        httpurl, httperr, httplib.responses[httperr])
                else:
                    try:
                        desc = httplib.responses[httperr]
                    except KeyError:
                        desc = 'Unknown status retrieved'
                    message += '\t%s: status %s: %s\n' % (httpurl, httperr, desc)
                message += '\t[ ' + source[apos:source.find('>', apos) + 1] + ' ]\n'
                if not urls[httpurl]:
                    message += '\t[ EMPTY anchor detected ]\n'
                message += '\n'
        if settings.FEEDBACK_MODE == settings.MODE_COMPLETE and len(errlnks[1]) != 0:
            message += '[Links broken due to URL errors]\n'
            for urlurl, urlerr in errlnks[1].iteritems():
                apos = errlnks[2][urlurl]
                message += '\t%s: reason %s\n' % (urlurl, urlerr)
                message += '\t[ ' + source[apos:source.find('>', apos) + 1] + ' ]\n'
                if not urls[urlurl]:
                    message += '\t[ EMPTY anchor detected ]\n'
                message += '\n'
        message += '-' * 20 + '\n'
        wfile.write(message)