From af691dc2ded5fcf0fc7d2d56af841adad50ffd02 Mon Sep 17 00:00:00 2001
From: Leonardo Robol <leo@robol.it>
Date: Tue, 27 Oct 2009 23:08:30 +0100
Subject: [PATCH] =?UTF-8?q?Modificato=20il=20meccanismo=20di=20gestione=20?=
 =?UTF-8?q?degli=20"step"=20per=20controllare=20quanto=20un=20link=20si=20?=
 =?UTF-8?q?trova=20lontano=20dal=20punto=20di=20partenza=20(tramite=20il?=
 =?UTF-8?q?=20percorso=20pi=C3=B9=20breve)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spidy.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 94 insertions(+), 21 deletions(-)
diff --git a/spidy.py b/spidy.py
index 51bc6d6..84e54db 100755
--- a/spidy.py
+++ b/spidy.py
@@ -21,9 +21,11 @@ url_counter = range(size)
 
 max_steps = 5
 
-debug = False
+debug = 0
 outfile = "connections.txt"
 
+ExitRequired = False
+
 
 def get_links(page):
     """Restituisce una lista con i link
@@ -52,6 +54,9 @@ def expand_url(parent, url):
       http://www.example.com/pagina.html
     """
 
+    if len(url) == 0:
+        return url
+
     ## Controllo che l'url non cominci con un punto
     ## nel qual caso cerchiamo di rimediare subito, 
     ## ma non cadiamo nel tranello di ignorare i ..
@@ -105,16 +110,17 @@ class Page():
     def __repr__(self):
         return "<Page object: %s>" % self.url
 
-    def __init__(self, url=""):
+    def __init__(self, url="", parent=None):
+        
 
         if(url != ""):
-            mtx_url_dict.lock(self.__new_page, url)
+            mtx_url_dict.lock(self.__new_page, (url, parent))
             mtx_url_dict.unlock()
         else:
-            mtx_url_dict.lock(self.__get_page, 0)
+            mtx_url_dict.lock(self.__get_page, parent)
             mtx_url_dict.unlock()
 
-    def __get_page(self, num):
+    def __get_page(self, parent):
 
         if(len(url_counter) == 0):
             self.exhausted = True
@@ -134,14 +140,22 @@ class Page():
             if not page_found:
                 time.sleep(1)
 
-        
+            # Questo è un punto dove il Crawler
+            # si potrebbe bloccare e quindi facciamo
+            # un check sull'ExitRequired
+            # Ovviamente poi sarà necessario che anche
+            # il chiamante lo faccia!
+            if ExitRequired:
+                return
 
+        
         self.ID = page.ID
         self.analyzed = page.analyzed
         self.exhausted = False
+        self.step = page.step
         url_dict[url].analyzed = True
         
-    def __new_page(self, url):
+    def __new_page(self, (url, parent)):
         # Questo ci serve per tenere il 
         # conto di tutti gli url
         global url_dict
@@ -150,11 +164,18 @@ class Page():
         self.exhausted = False
         self.analyzed = False
         self.url = url
+        
 
         if(url_dict.has_key(url)):
             # Preservo i parametri che esistono già!
             self.ID = url_dict[url].ID
             self.analyzed = url_dict[url].analyzed
+            self.step = url_dict[url].step
+            if parent == None:
+                self.step = 0
+            else:
+                if(parent.step < self.step):
+                    self.step = parent.step + 1
 
         else:
             try:
@@ -163,14 +184,20 @@ class Page():
                 self.exhausted = True
                 
 
+            # Conto in quanti passi si raggiunge questa pagina
+            if parent == None:
+                self.step = 0
+            else:
+                self.step = parent.step + 1
             url_dict[url] = self
             url_dict[url].links = []
 
+
     def add_link(self, page):
 
         if(page.exhausted):
             return -1
-        if debug:
+        if debug >= 2:
             print " => Adding link to %s" % page.url
         mtx_url_dict.lock(self.__add_link, page.ID)
         mtx_url_dict.unlock()
@@ -193,27 +220,43 @@ class Crawler(threading.Thread):
     def __init__(self, startpage=default_page):
         threading.Thread.__init__(self)
         self.start_page = startpage
+        self.waitingforpage = False
+
+    def WaitingForPage(self):
+        """Ritorna True se il Crawler sta cercando di ottenere
+        una nuova pagina"""
+        return self.waitingforpage
         
 
     def run(self):
 
-        step_counter = 0
-
         # Capiamo che pagina ci serve
         page = Page(self.start_page)
 
         while(not page.exhausted):
+            self.waitingforpage = True
+            ## Stiamo attenti a non fare troppi passi
+            ## dalla pagina di partenza
+            page = Page()
+            
+            ## Se ci chiedono di uscire perché abbiamo
+            ## analizzato tutte le pagine ce ne andiamo
+            if ExitRequired:
+                return
 
-            if(step_counter > max_steps):
+            if page.step >= max_steps:
+                print " => Ohi, troppi step! Riparto"
                 page = Page(self.start_page)
-                step_counter = 0
-            else:
-                page = Page()
-                step_counter += 1
+
+            self.waitingforpage = False
 
             if page.exhausted:
                 break
 
+            if debug >= 1:
+                print " => Analyzing page %s" % page.url
+
+
             # Come prima cosa devo fare il parsing dei
             # link che ci sono nella pagina
             # Diamo una mixata per simulare meglio
@@ -227,7 +270,7 @@ class Crawler(threading.Thread):
             if not links == -1:
                 random.shuffle(links)
                 for l in links:
-                    lpage = Page(l)
+                    lpage = Page(l, page)
 
                     if not lpage.exhausted:
                         page.add_link(lpage)
@@ -242,8 +285,8 @@ if __name__ == "__main__":
     parser = OptionParser()
     parser.add_option("-c", "--concurrency", dest="concurrency", action="store",
                       help="Set level of concurrency (i.e. how many threads)", default=3)
-    parser.add_option("-d", "--debug", dest="debug", action="store_true",
-                      help="Activate debug mode", default=False)
+    parser.add_option("-d", "--debug", dest="debug", action="store",
+                      help="Set debug level", default=0)
     parser.add_option("-o", "--output", dest="outfile", action="store",
                       help="Name of the output file for the connection matrix", default="connections.txt")
     parser.add_option("-n", "--number", dest="size", action="store",
@@ -263,6 +306,7 @@ if __name__ == "__main__":
     outfile = option.outfile
     size = int(option.size)
     url_counter = range(size)
+    url_counter.reverse()
     max_steps = int(option.max_steps)
     default_page = option.start_page
 
@@ -276,23 +320,52 @@ if __name__ == "__main__":
     " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, outfile)
     
     
+    ## Avvio i thread, di modo che si impegnino a lavorare
+    ## Ora il mio scopo sarà controllare che non finiscano
+    ## le pagine
     threads = []
     for i in range(0, concurrency):
         threads.append(Crawler(default_page))
         threads[i].start()
 
+    while threads[0].isAlive():
+        ## Controllo se tutti i thread sono
+        ## alla ricerca di una pagina. Se la
+        ## risposta è sì le pagine sono finite
+        ## e possiamo uscire
+        PageAreExhausted = True
+        for t in threads:
+            if not t.WaitingForPage():
+                PageAreExhausted = False
+                break
+
+        if PageAreExhausted:
+            ## Questa variabile globale farà
+            ## uscire tutti i thread
+            ExitRequired = True
+            print " => There are no more pages in my range, exiting"
+            break
+
+        ## Se non c'è niente da fare posso
+        ## anche rilassarmi ed aspettare un
+        ## secondo prima di rieseguire il check
+        time.sleep(1)
+
 
     ## Qui non c'è modo umano di terminare il
     ## suo lavoro, bisognerà studiarci sopra
-    for i in range(0, concurrency):
-        threads[i].join()
+    for t  in threads:
+        t.join()
         
 
     ## A questo punto mi devo preoccupare di salvare
     ## la matrice in un formato soddisfacente
 
     out = open(outfile, 'w')
-    out.write(str(size) + "\n")
+
+    ## Il numero massimo di pagine meno quelle avanzate = 
+    ## le pagine effettivamente usate!
+    out.write(str(size - len(url_counter)) + "\n")
 
     
     
-- 
2.1.4