From ddf634e46eedb96b5cbfa5cea1267313360c54d3 Mon Sep 17 00:00:00 2001
From: Leonardo Robol
Date: Wed, 28 Oct 2009 17:52:20 +0100
Subject: [PATCH] More interface changes for consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Debug messages drop the " => " prefix and use their numeric level
consistently, getNewPage() now skips pages farther than max_steps from
the start page instead of letting each crawler restart on its own,
already-exhausted pages are detected explicitly rather than by catching
the AttributeError on page.ID, the debug option is parsed as a numeric
level instead of a flag, and the legend file name can be chosen with
the new -l/--legend-file option. Pages that were analyzed but left out
to respect the requested matrix size are now counted and reported once
at the end instead of printing one message per page.
---
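A minimal reading-aid sketch, not part of this patch: it loads back the
two files written at the end of spidy.py, assuming only the format used
there (the matrix file starts with the link counter on a line of its
own, followed by one "ID<TAB>ID" row per link; the legend file holds
one "ID<TAB>url" row per page). The function name and both parameters
are placeholders, not names from spidy.py.

    def load_results(outfile, legend_file):
        # legend: one "ID\turl" row per page, mapping integer IDs back to urls
        legend = {}
        for line in open(legend_file):
            page_id, url = line.rstrip("\n").split("\t", 1)
            legend[int(page_id)] = url

        # matrix: first line is the number of links, then one "src\tdst" row each
        mat = open(outfile)
        link_counter = int(mat.readline())
        links = [tuple(int(x) for x in line.split("\t")) for line in mat]
        mat.close()
        return legend, link_counter, links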
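The new option follows the same optparse pattern as the existing ones.
Below is a self-contained sketch of just its parsing behaviour; the
add_option call is the one introduced by this patch, while the argument
list fed to parse_args() is a hypothetical command line:

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l", "--legend-file", dest="legend_file", action="store",
                      help="Conversion table from integer indexes to urls",
                      default="legend.txt")

    (option, args) = parser.parse_args(["-l", "mylegend.txt"])
    print option.legend_file    # prints "mylegend.txt"

Running ./spidy.py -l mylegend.txt therefore writes the legend there,
where it used to be hard-coded to legenda.txt.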
Riparto",2) - page = Page(self.start_page) self.waitingforpage = False if debug >= 1: - debug(" => Analyzing page %s" % str(page), 2) + debug("Analyzing page %s" % str(page), 1) @@ -412,13 +411,16 @@ if __name__ == "__main__": default="http://poisson.phc.unipi.it", help="Starting page for all the crawlers", action="store") + parser.add_option("-l", "--legend-file", dest="legend_file", action="store", + help="Conversion table from integers indexes to urls", default="legend.txt") (option, args) = parser.parse_args() concurrency = int(option.concurrency) - debug_value = bool(option.debug) + debug_value = int(option.debug) outfile = option.outfile + legend_file = option.legend_file size = int(option.size) url_stack = range(size) url_stack.reverse() @@ -431,8 +433,9 @@ if __name__ == "__main__": %d thread(s)\n\ %d pages to analyze\n\ %d max steps from the start page, %s\n\ - Writing on file %s\n\ - " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, outfile) + %d level of debug\n\ + Writing on file %s and %s\n\ + " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, debug_value, outfile, legend_file) ## Avvio i thread, di modo che si impegnino a lavorare @@ -480,7 +483,7 @@ if __name__ == "__main__": ## la matrice in un formato soddisfacente out = open(outfile, 'w') - leg = open("legenda.txt", 'w') + leg = open(legend_file, 'w') ## Il numero massimo di pagine meno quelle avanzate = ## le pagine effettivamente usate! @@ -489,16 +492,21 @@ if __name__ == "__main__": ## Il numero di elementi non 0 sono i link! out.write(str(link_counter) + "\n") - + ## Con questa contiamo quante pagine abbiamo + ## perso troncando il procedimento + no_id = 0 for url in url_dict: try: leg.write(str(url_dict.getPage(url).ID) + "\t" + url + "\n" ) except AttributeError: - print " => No ID on %s" % str(url) + no_id += 1 for link in url_dict.getLinks(url): out.write(str(url_dict.getPage(url).ID) + "\t" + str(link) + "\n") + print " => %d pagine sono state analizzate ma non incluse per rispettare \n\ + la dimensione della matrice scelta" % no_id + out.close() leg.close() -- 2.1.4