From ddf634e46eedb96b5cbfa5cea1267313360c54d3 Mon Sep 17 00:00:00 2001
From: Leonardo Robol
Date: Wed, 28 Oct 2009 17:52:20 +0100
Subject: [PATCH] More interface changes for consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Debug messages drop the " => " prefix and use their numeric level
consistently, getNewPage() now skips pages farther than max_steps from
the start page instead of letting each crawler restart on its own,
already-exhausted pages are detected explicitly rather than by catching
the AttributeError on page.ID, the debug option is parsed as a numeric
level instead of a flag, and the legend file name can be chosen with
the new -l/--legend-file option. Pages that were analyzed but left out
to respect the requested matrix size are now counted and reported once
at the end instead of printing one message per page.
---
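A minimal reading-aid sketch, not part of this patch: it loads back the
two files written at the end of spidy.py, assuming only the format used
there (the matrix file starts with the link counter on a line of its
own, followed by one "ID<TAB>ID" row per link; the legend file holds
one "ID<TAB>url" row per page). The function name and both parameters
are placeholders, not names from spidy.py.

    def load_results(outfile, legend_file):
        # legend: one "ID\turl" row per page, mapping integer IDs back to urls
        legend = {}
        for line in open(legend_file):
            page_id, url = line.rstrip("\n").split("\t", 1)
            legend[int(page_id)] = url

        # matrix: first line is the number of links, then one "src\tdst" row each
        mat = open(outfile)
        link_counter = int(mat.readline())
        links = [tuple(int(x) for x in line.split("\t")) for line in mat]
        mat.close()
        return legend, link_counter, links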
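The new option follows the same optparse pattern as the existing ones.
Below is a self-contained sketch of just its parsing behaviour; the
add_option call is the one introduced by this patch, while the argument
list fed to parse_args() is a hypothetical command line:

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l", "--legend-file", dest="legend_file", action="store",
                      help="Conversion table from integer indexes to urls",
                      default="legend.txt")

    (option, args) = parser.parse_args(["-l", "mylegend.txt"])
    print option.legend_file    # prints "mylegend.txt"

Running ./spidy.py -l mylegend.txt therefore writes the legend there,
where it used to be hard-coded to legenda.txt.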
Riparto",2) - page = Page(self.start_page) self.waitingforpage = False if debug >= 1: - debug(" => Analyzing page %s" % str(page), 2) + debug("Analyzing page %s" % str(page), 1) @@ -412,13 +411,16 @@ if __name__ == "__main__": default="http://poisson.phc.unipi.it", help="Starting page for all the crawlers", action="store") + parser.add_option("-l", "--legend-file", dest="legend_file", action="store", + help="Conversion table from integers indexes to urls", default="legend.txt") (option, args) = parser.parse_args() concurrency = int(option.concurrency) - debug_value = bool(option.debug) + debug_value = int(option.debug) outfile = option.outfile + legend_file = option.legend_file size = int(option.size) url_stack = range(size) url_stack.reverse() @@ -431,8 +433,9 @@ if __name__ == "__main__": %d thread(s)\n\ %d pages to analyze\n\ %d max steps from the start page, %s\n\ - Writing on file %s\n\ - " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, outfile) + %d level of debug\n\ + Writing on file %s and %s\n\ + " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, debug_value, outfile, legend_file) ## Avvio i thread, di modo che si impegnino a lavorare @@ -480,7 +483,7 @@ if __name__ == "__main__": ## la matrice in un formato soddisfacente out = open(outfile, 'w') - leg = open("legenda.txt", 'w') + leg = open(legend_file, 'w') ## Il numero massimo di pagine meno quelle avanzate = ## le pagine effettivamente usate! @@ -489,16 +492,21 @@ if __name__ == "__main__": ## Il numero di elementi non 0 sono i link! out.write(str(link_counter) + "\n") - + ## Con questa contiamo quante pagine abbiamo + ## perso troncando il procedimento + no_id = 0 for url in url_dict: try: leg.write(str(url_dict.getPage(url).ID) + "\t" + url + "\n" ) except AttributeError: - print " => No ID on %s" % str(url) + no_id += 1 for link in url_dict.getLinks(url): out.write(str(url_dict.getPage(url).ID) + "\t" + str(link) + "\n") + print " => %d pagine sono state analizzate ma non incluse per rispettare \n\ + la dimensione della matrice scelta" % no_id + out.close() leg.close() -- 2.1.4