From a28447785dc91071d588d2cdea1dd59116f00f8c Mon Sep 17 00:00:00 2001
From: Leonardo Robol
Date: Tue, 27 Oct 2009 08:41:50 +0100
Subject: [PATCH] Added redirect handling. Still missing the ability to handle
 links that contain ".."
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spidy | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/spidy b/spidy
index f744550..048cf51 100755
--- a/spidy
+++ b/spidy
@@ -28,7 +28,8 @@ outfile = "connections.txt"
 def get_links(page):
     """Restituisce una lista con i link presenti nella pagina data,
     in forma canonica"""
-    content = get_page(page.url)
+    content, real_url = get_content(page.url)
+
     if(content == -1):
         return -1
 
@@ -37,7 +38,7 @@ def get_links(page):
     for link in links:
         # Espando il link in modo da (speriamo!)
         # garantire l'unicità
-        ret.append(expand_url(page.url, link))
+        ret.append(expand_url(real_url, link))
 
     return ret
 
@@ -73,7 +74,8 @@ def expand_url(parent, url):
     if re.search("\.[^/]*$", parent):
         parent = re.sub("[^/]*$", "", parent)
     else:
-        parent += "/"
+        if not parent.endswith("/"):
+            parent += "/"
 
@@ -84,15 +86,15 @@ def expand_url(parent, url):
         url = parent + url
     return url
 
-def get_page(url):
+def get_content(url):
     """Cerca di scaricare l'url dato e restituisce -1
     se non ce la fa, il contenuto altrimenti"""
     try:
         req = urllib2.urlopen(url)
     except:
-        return -1
+        return (-1, None)
 
-    return req.read()
+    return (req.read(), req.geturl())
 
 class Page():
     """Una pagina web. Questa classe, quando viene istanziata,
@@ -268,12 +270,13 @@ if __name__ == "__main__":
 %d thread(s)\n\
 %d pages to analyze\n\
 %d max steps from the start page, %s\n\
-    " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page)
+    Writing on file %s\n\
+    " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, outfile)
 
     threads = []
     for i in range(0, concurrency):
-        threads.append(Crawler())
+        threads.append(Crawler(default_page))
         threads[i].start()
 
@@ -293,7 +296,7 @@ if __name__ == "__main__":
 
     for page in url_dict:
        for link in url_dict[page].links:
-            out.write(str(url_dict[page].ID) + "\t" + str(link) + "\n")
+            out.write(page + "\t" + str(url_dict[page].ID) + "\t" + str(link) + "\n")
 
     l = time.localtime(time.time())
     print " => Work completed at %s:%s:%s " % (l.tm_hour,l.tm_min,l.tm_sec)
-- 
2.1.4
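
Note: for reference, here is a minimal standalone sketch of the redirect handling this
patch introduces. It assumes Python 2 with urllib2, as spidy already uses; the function
mirrors the patched get_content(), and the example URL is purely hypothetical.
urlopen() follows HTTP redirects on its own, and geturl() then reports the URL that was
actually served, which is the right base to pass to expand_url() when canonicalizing
relative links.

    # Sketch only (assumes Python 2 / urllib2, as in spidy).
    import urllib2

    def get_content(url):
        """Try to fetch url; return (-1, None) on failure,
        (body, final_url) otherwise."""
        try:
            req = urllib2.urlopen(url)
        except Exception:
            return (-1, None)
        # geturl() gives the post-redirect URL, which may differ
        # from the URL that was originally requested.
        return (req.read(), req.geturl())

    if __name__ == "__main__":
        # Hypothetical URL, used only to illustrate the call.
        body, real_url = get_content("http://example.com/moved-page")
        if body != -1:
            print "requested http://example.com/moved-page, served from %s" % real_url

Returning the post-redirect URL alongside the body is what lets get_links() expand
relative links against the page's real location instead of the originally requested one.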