From a28447785dc91071d588d2cdea1dd59116f00f8c Mon Sep 17 00:00:00 2001
From: Leonardo Robol
Date: Tue, 27 Oct 2009 08:41:50 +0100
Subject: [PATCH] Added redirect handling. Still missing the ability to handle
 links that contain ".."
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spidy | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/spidy b/spidy
index f744550..048cf51 100755
--- a/spidy
+++ b/spidy
@@ -28,7 +28,8 @@ outfile = "connections.txt"
 def get_links(page):
     """Restituisce una lista con i link presenti nella pagina data,
     in forma canonica"""
-    content = get_page(page.url)
+    content, real_url = get_content(page.url)
+
     if(content == -1):
         return -1
 
@@ -37,7 +38,7 @@ def get_links(page):
     for link in links:
         # Espando il link in modo da (speriamo!)
         # garantire l'unicità
-        ret.append(expand_url(page.url, link))
+        ret.append(expand_url(real_url, link))
 
     return ret
 
@@ -73,7 +74,8 @@ def expand_url(parent, url):
     if re.search("\.[^/]*$", parent):
         parent = re.sub("[^/]*$", "", parent)
     else:
-        parent += "/"
+        if not parent.endswith("/"):
+            parent += "/"
 
@@ -84,15 +86,15 @@ def expand_url(parent, url):
         url = parent + url
     return url
 
-def get_page(url):
+def get_content(url):
     """Cerca di scaricare l'url dato e restituisce -1
     se non ce la fa, il contenuto altrimenti"""
     try:
         req = urllib2.urlopen(url)
     except:
-        return -1
+        return (-1, None)
 
-    return req.read()
+    return (req.read(), req.geturl())
 
 class Page():
     """Una pagina web. Questa classe, quando viene istanziata,
@@ -268,12 +270,13 @@ if __name__ == "__main__":
 %d thread(s)\n\
 %d pages to analyze\n\
 %d max steps from the start page, %s\n\
-    " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page)
+    Writing on file %s\n\
+    " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page, outfile)
 
     threads = []
     for i in range(0, concurrency):
-        threads.append(Crawler())
+        threads.append(Crawler(default_page))
         threads[i].start()
 
@@ -293,7 +296,7 @@ if __name__ == "__main__":
 
     for page in url_dict:
        for link in url_dict[page].links:
-            out.write(str(url_dict[page].ID) + "\t" + str(link) + "\n")
+            out.write(page + "\t" + str(url_dict[page].ID) + "\t" + str(link) + "\n")
 
     l = time.localtime(time.time())
     print " => Work completed at %s:%s:%s " % (l.tm_hour,l.tm_min,l.tm_sec)
-- 
2.1.4
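
Note: for reference, here is a minimal standalone sketch of the redirect handling this
patch introduces. It assumes Python 2 with urllib2, as spidy already uses; the function
mirrors the patched get_content(), and the example URL is purely hypothetical.
urlopen() follows HTTP redirects on its own, and geturl() then reports the URL that was
actually served, which is the right base to pass to expand_url() when canonicalizing
relative links.

    # Sketch only (assumes Python 2 / urllib2, as in spidy).
    import urllib2

    def get_content(url):
        """Try to fetch url; return (-1, None) on failure,
        (body, final_url) otherwise."""
        try:
            req = urllib2.urlopen(url)
        except Exception:
            return (-1, None)
        # geturl() gives the post-redirect URL, which may differ
        # from the URL that was originally requested.
        return (req.read(), req.geturl())

    if __name__ == "__main__":
        # Hypothetical URL, used only to illustrate the call.
        body, real_url = get_content("http://example.com/moved-page")
        if body != -1:
            print "requested http://example.com/moved-page, served from %s" % real_url

Returning the post-redirect URL alongside the body is what lets get_links() expand
relative links against the page's real location instead of the originally requested one.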