diff --git a/spidy b/spidy
old mode 100644
new mode 100755
index c2aa1bf..f744550
--- a/spidy
+++ b/spidy
@@ -15,12 +15,16 @@ __author__ = "Leonardo Robol <leo@robol.it>"
mtx_url_dict = mutex.mutex()
-N = 1000
+size = 1000
url_dict = {}
-url_counter = range(N)
+url_counter = range(size)
max_steps = 5
+debug = False
+outfile = "connections.txt"
+
+
def get_links(page):
"""Restituisce una lista con i link
presenti nella pagina data, in forma canonica"""
@@ -164,7 +168,8 @@ class Page():
if(page.exhausted):
return -1
- print " => Adding link to %s" % page.url
+ if debug:
+ print " => Adding link to %s" % page.url
mtx_url_dict.lock(self.__add_link, page.ID)
mtx_url_dict.unlock()
return 0
@@ -213,12 +218,12 @@ class Crawler(threading.Thread):
# at random.. since it is likely that
# some links will remain unvisited anyway!
links = get_links(page)
- random.shuffle(links)
## At this point, being in charge of this page,
## I have to add all the links to the page
if not links == -1:
+ random.shuffle(links)
for l in links:
lpage = Page(l)
@@ -235,12 +240,36 @@ if __name__ == "__main__":
parser = OptionParser()
parser.add_option("-c", "--concurrency", dest="concurrency", action="store",
help="Set level of concurrency (i.e. how many threads)", default=3)
+ parser.add_option("-d", "--debug", dest="debug", action="store_true",
+ help="Activate debug mode", default=False)
+ parser.add_option("-o", "--output", dest="outfile", action="store",
+ help="Name of the output file for the connection matrix", default="connections.txt")
+ parser.add_option("-n", "--number", dest="size", action="store",
+ help="Number of pages to analyze", default=1000)
+ parser.add_option("-m", "--max-steps", dest="max_steps", action="store",
+ help="Max steps to walk from the starting page", default=5)
+ parser.add_option("-s", "--start-page", dest="start_page", default="http://poisson.phc.unipi.it",
+ action="store")
(option, args) = parser.parse_args()
- concurrency = option.concurrency
- print " => Starting with concurrency %d" % concurrency
+ concurrency = int(option.concurrency)
+ debug = bool(option.debug)
+ outfile = option.outfile
+ size = int(option.size)
+ url_counter = range(size)
+ max_steps = int(option.max_steps)
+ default_page = option.start_page
+
+
+ l = time.localtime(time.time())
+ print " => Starting with this configuration at %s:%s:%s\n\
+ %d thread(s)\n\
+ %d pages to analyze\n\
+ %d max steps from the start page, %s\n\
+ " % (l.tm_hour,l.tm_min,l.tm_sec, concurrency, size, max_steps, default_page)
+
threads = []
for i in range(0, concurrency):
@@ -257,13 +286,17 @@ if __name__ == "__main__":
## At this point I have to take care of saving
## the matrix in a satisfactory format
- out = open("ji.txt", 'w')
+ out = open(outfile, 'w')
+ out.write(str(size) + "\n")
+
+
for page in url_dict:
for link in url_dict[page].links:
- out.write(page + "\t" + str(url_dict[page].ID) + "\t" + str(link) + "\n")
-
+ out.write(str(url_dict[page].ID) + "\t" + str(link) + "\n")
+ l = time.localtime(time.time())
+ print " => Work completed at %s:%s:%s " % (l.tm_hour,l.tm_min,l.tm_sec)