#This code comes from this tutorial: http://answers.oreilly.com/topic/1088-how-to-build-a-simple-web-crawler/
# -*- coding: utf-8 -*-
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
# Words to ignore, kept in a set (unordered, duplicates collapsed).
ignorewords = set((
    'the', 'of', 'to', 'and',
    'a', 'in', 'is', 'it',
))
# A set looks like a list, and since it isn't a frozenset (and so is not immutable), it really does look like a list. So what's the difference?
def crawl(self, pages, depth=2):
    """Crawl breadth-first from ``pages``, following links ``depth`` levels deep.

    Each page is fetched with ``urllib2.urlopen``, parsed with
    ``BeautifulSoup`` and handed to ``self.addtoindex``.  Every ``<a>`` tag
    that has an ``href`` is resolved against the page URL; links that start
    with ``http`` and are not yet indexed (per ``self.isindexed``) are queued
    for the next pass.

    This is written as a method: ``self`` must provide ``addtoindex``,
    ``isindexed``, ``gettextonly``, ``addlinkref`` and ``dbcommit``.  None of
    them are defined in this snippet, so what they do cannot be stated here.
    NOTE(review): presumably this function belongs inside a ``crawler``
    class that defines those methods -- confirm.

    Args:
        pages: iterable of URL strings to start from.
        depth: number of passes to make; ``depth`` is defined right here as
            a parameter and defaults to 2 when the caller omits it.

    Returns:
        None.  Results are reported only through the calls made on ``self``.
    """
    # NOTE(review): the pasted original had lost all indentation.  The
    # nesting below is reconstructed from the data flow (``url`` only exists
    # when an href is present; ``pages = newpages`` must follow a full pass).
    # The placement of ``addlinkref`` (every accepted link) and ``dbcommit``
    # (once per page) is the most plausible reading -- confirm.

    # The loop variable is only a pass counter and is never read.
    for _ in range(depth):
        # set() builds an empty set; it collects the links found in this
        # pass and silently drops duplicates.
        newpages = set()
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except Exception:
                # Best effort: report the page and move on.  Unlike a bare
                # ``except:``, this lets KeyboardInterrupt and SystemExit
                # through so the crawl can still be interrupted.
                # The parenthesised form prints the same text on Python 2.
                print("Could not open %s" % page)
                continue
            try:
                html = c.read()
            finally:
                # Release the connection even if reading fails.
                c.close()
            soup = BeautifulSoup(html)
            self.addtoindex(page, soup)

            # Calling the soup with a tag name returns the matching tags.
            for link in soup('a'):
                # dict(link.attrs) lets us test for the attribute by name.
                if 'href' not in dict(link.attrs):
                    continue
                # Resolve relative links against the page they came from.
                url = urljoin(page, link['href'])
                # Skip URLs containing a single quote.
                # NOTE(review): presumably because the URL is later embedded
                # in a quoted string -- confirm.
                if "'" in url:
                    continue
                url = url.split('#')[0]  # remove location portion
                if url.startswith('http') and not self.isindexed(url):
                    newpages.add(url)
                linkText = self.gettextonly(link)
                self.addlinkref(page, url, linkText)

            self.dbcommit()

        # The links found in this pass are the pages fetched in the next one.
        pages = newpages
................
Here we go!
................
>>> import searchengine
>>> pagelist=['http://kiwitobes.com/wiki/Perl.html']
>>> crawler = searchengine.crawler('')
Traceback (most recent call last):
File "<input>", line 1, in <module>
AttributeError: 'module' object has no attribute 'crawler'