spacepaste

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
#This code comes from this tutorial: http://answers.oreilly.com/topic/1088-how-to-build-a-simple-web-crawler/


# -*- coding: utf-8 -*-
import urllib2

from BeautifulSoup import *

from urlparse import urljoin



# Words to ignore, held in a set (an unordered collection with no duplicates).
ignorewords = set('the of to and a in is it'.split())

# A set looks like a list, and since it isn't a frozenset (and so is not immutable), it really does look like a list. So what's the difference?


def crawl(self, pages, depth=2):
    """Crawl outward from ``pages``, one level of links per pass.

    On every pass each URL in the current page list is fetched, parsed with
    BeautifulSoup and handed to ``self.addtoindex``.  Every ``<a>`` tag that
    has an ``href`` is resolved against the page URL and recorded through
    ``self.addlinkref``.  Resolved URLs that start with ``http`` and for
    which ``self.isindexed(url)`` is false become the page list of the next
    pass.

    ``self`` must provide ``addtoindex``, ``isindexed``, ``gettextonly``,
    ``addlinkref`` and ``dbcommit``; none of them is defined in this
    function.
    NOTE(review): the ``self`` parameter suggests this function is meant to
    live inside a class as a method -- confirm.

    Args:
        pages: iterable of URL strings to fetch on the first pass.
        depth: number of passes to make (defaults to 2).

    Returns:
        None.  Pages that cannot be opened are reported on stdout and
        skipped.
    """
    # The loop variable is unused: the loop only repeats the pass `depth` times.
    for _ in range(depth):
        # Empty set that collects the URLs to visit on the next pass.
        newpages = set()

        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except Exception:
                # Best effort: report the failure and move on.  Catching
                # Exception (not a bare except) lets Ctrl-C stop the crawl.
                # The parenthesised print gives the same output on Python 2
                # and is also valid Python 3 syntax.
                print("Could not open %s" % page)
                continue

            # Always release the connection, even if reading fails.
            try:
                html = c.read()
            finally:
                c.close()

            soup = BeautifulSoup(html)
            self.addtoindex(page, soup)

            # Calling the soup with a tag name yields the matching tags.
            for link in soup('a'):
                if 'href' not in dict(link.attrs):
                    continue

                # Resolve relative links against the page they appear on.
                url = urljoin(page, link['href'])

                # Skip URLs containing a single quote.
                # NOTE(review): presumably because the URL is later embedded
                # in quoted text -- confirm against addlinkref/isindexed.
                if "'" in url:
                    continue

                url = url.split('#')[0]  # remove location portion

                if url.startswith('http') and not self.isindexed(url):
                    newpages.add(url)

                linkText = self.gettextonly(link)
                self.addlinkref(page, url, linkText)

            # Commit once per successfully processed page, not once per link.
            self.dbcommit()

        # Advance only after the whole pass is done, so that a pass which
        # finds no links does not re-crawl the same pages.
        pages = newpages



................
Here we go!
................


>>> import searchengine
>>> pagelist=['http://kiwitobes.com/wiki/Perl.html']
>>> crawler = searchengine.crawler('')
Traceback (most recent call last):
  File "<input>", line 1, in <module>
AttributeError: 'module' object has no attribute 'crawler'