Sketch of a recursive link-following Python web crawler

As an evolution of the previous Python snippet, this script searches for a string in the content of a URL and recursively follows the links found on that page.

import re
import sys
import urllib2
import urlparse

def getlinksfromurl(url):
    """Fetch ``url`` and return the set of links found in its ``href`` attributes.

    Only three kinds of link are kept:

    * links starting with ``http`` are added unchanged;
    * links starting with ``/`` or ``./`` are resolved against ``url`` with
      ``urlparse.urljoin`` so that the page's own path is handled correctly
      (plain concatenation would turn ``http://host/page`` + ``/x`` into
      ``http://host/page/x``).

    Every other link (anchors, ``mailto:``, bare relative names, ...) is
    ignored.

    Args:
        url: Address of the page to download.

    Returns:
        A set of URL strings; empty when the page cannot be fetched or
        contains no matching links.
    """
    try:
        usock = urllib2.urlopen(url)
        try:
            data = usock.read()
        finally:
            # Release the connection even when read() fails half-way.
            usock.close()
    except Exception:
        # Best-effort crawl: an unreachable or malformed URL simply yields
        # no links.  ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit stop the crawl.
        return set()

    linkset = set()
    # Accept single or double quotes and optional whitespace around "=";
    # the back-reference makes sure the closing quote matches the opening one.
    for match in re.finditer(r'href\s*=\s*(["\'])(.*?)\1', data, re.IGNORECASE):
        linkstring = match.group(2)
        if linkstring.startswith('http'):
            linkset.add(linkstring)
        elif linkstring.startswith(('/', './')):
            linkset.add(urlparse.urljoin(url, linkstring))
    return linkset

def searchstringfromurl(string,url):
    """Count the case-insensitive matches of ``string`` in the content of ``url``.

    ``string`` is handed to ``re.finditer`` as-is, so it is interpreted as a
    regular expression, not as a literal.

    Args:
        string: Pattern to look for.
        url: Address of the page to download.

    Returns:
        The number of non-overlapping matches.  A page that cannot be fetched
        is treated as empty content.
    """
    try:
        usock = urllib2.urlopen(url)
        try:
            data = usock.read()
        finally:
            # Release the connection even when read() fails half-way.
            usock.close()
    except Exception:
        # Best-effort crawl: treat an unreachable page as empty.  ``Exception``
        # (rather than a bare ``except``) lets KeyboardInterrupt and
        # SystemExit stop the crawl.
        data = ''
    return sum(1 for _ in re.finditer(string, data, re.IGNORECASE))

def searchstringfromurlrecursive(string,url,deep,maxdeep): # TODO: backlinks control
    """Count matches of ``string`` in ``url`` and in the pages it links to.

    The page at ``url`` is always searched.  While ``deep`` is below
    ``maxdeep`` every link returned by ``getlinksfromurl(url)`` is followed
    recursively with ``deep + 1`` before the page itself is searched.
    Already-visited pages are not tracked, so a page reachable through
    several links is fetched and counted once per path.

    Args:
        string: Pattern passed on to ``searchstringfromurl``.
        url: Page to analyse at this level.
        deep: Current recursion level (the initial call passes 0).
        maxdeep: Level at which links are no longer followed.  Coerced with
            ``int()`` because the command line delivers it as a string;
            without the coercion the level comparisons never stop the
            recursion in Python 2 and the ``%i`` format below raises
            ``TypeError``.

    Returns:
        Total number of matches found on this page and all pages below it.
    """
    maxdeep = int(maxdeep)
    linkset = getlinksfromurl(url)
    # Single-argument parenthesised print produces identical output under the
    # Python 2 print statement.
    print("Analizing (level %i) url %s" % (deep, url))
    print("%i links retrived" % len(linkset))

    if deep >= maxdeep:  # last iteration level: search this page only
        print("Reading last level iteration.")
        return searchstringfromurl(string, url)

    # Recursive section: visit every link first, then this page.
    totalhits = 0
    for count, link in enumerate(linkset, 1):
        print("We are at %i recursive level (max %i levels). There are %i links, going into link number %i at url %s" % (deep + 1, maxdeep, len(linkset), count, link))
        totalhits += searchstringfromurlrecursive(string, link, deep + 1, maxdeep)
    totalhits += searchstringfromurl(string, url)
    print("Leaving linkset...")
    return totalhits

# Main program
def main(argv):
    """Run the crawler from a command line.

    Args:
        argv: Argument list in ``sys.argv`` layout:
            ``[program, url, string, maxdeep]``.  Extra trailing arguments
            are ignored.

    Returns:
        0 after a completed crawl, 2 when the arguments are missing or
        ``maxdeep`` is not an integer.
    """
    if len(argv) < 4:
        sys.stderr.write("Usage: %s <url> <string> <maxdeep>\n" % argv[0])
        return 2
    url = argv[1]
    string = argv[2]
    try:
        # Command-line arguments are strings; the recursion compares and
        # formats maxdeep as a number.
        maxdeep = int(argv[3])
    except ValueError:
        sys.stderr.write("maxdeep must be an integer, got %r\n" % argv[3])
        return 2
    hits = searchstringfromurlrecursive(string, url, 0, maxdeep)
    print("%i matches with the word %s" % (hits, string))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

Source

Posted in Internet, Programming, Python Tagged with: , ,

Leave a Reply

Your email address will not be published. Required fields are marked *

*