Python Web Crawler Class
[caption id="attachment_775" align="alignnone" width="202"] web crawler[/caption]
[sourcecode language="python"]
import sys
import re
import urllib2
import urlparse
import datetime
import os
class WebCrawler:
    """Simple crawler that follows ``<a href>`` links and saves each page.

    Every page that can be fetched is written to a text file inside the
    chosen directory, and its URL is appended to ``hyperlinks.txt`` in the
    current working directory.  Written for Python 2 (``urllib2``,
    ``urlparse`` and ``raw_input``).
    """

    # Class-level defaults are kept so references such as
    # ``WebCrawler.crawled`` still resolve; ``__init__`` gives every
    # instance its own sets so two crawlers never share state.
    tocrawl = set([])
    crawled = set([])
    # Captures the href value of an anchor tag.  ``\s+`` prevents matching
    # tags such as ``<abbr>``; the optional group lets other attributes
    # precede ``href``; only real quote characters delimit the value.
    linkregex = re.compile(
        r'<a\s+(?:[^>]*?\s)?href=[\'"](.*?)[\'"]', re.IGNORECASE)
    # Captures the text between <title ...> and </title> in any letter case.
    titleregex = re.compile(
        r'<title(?:\s[^>]*)?>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    def __init__(self):
        """Create a crawler with its own empty queue and visited set."""
        self.tocrawl = set()
        self.crawled = set()

    def setBaseURL(self, url):
        """Reset the queue so that crawling starts from ``url`` only."""
        self.tocrawl = set([url])

    def run(self):
        """Ask for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        # Named outputDir rather than ``dir`` to avoid shadowing the builtin.
        outputDir = raw_input('Where should I put crawled HTML source files?\n')
        self.crawl(outputDir)

    def getTitle(self, html):
        """Return the text of the first ``<title>`` element, or None.

        The tag name is matched case-insensitively and may carry
        attributes.  The title text is returned exactly as it appears.
        """
        match = self.titleregex.search(html)
        if match is None:
            return None
        return match.group(1)

    def writeToFile(self, url):
        """Append ``url`` as one line to ``hyperlinks.txt``."""
        with open('hyperlinks.txt', 'a') as logFile:
            logFile.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Save ``html`` as ``<dirPath>/<sanitised fileName>.txt``.

        Characters that are illegal in Windows file names, including both
        slash kinds, are removed from ``fileName``.  The directory is
        created first when it does not exist.
        """
        self.verifyDir(dirPath)
        # Raw string so that the backslash itself is part of the class.
        safeName = re.sub(r'[\\/:"*?<>|]', '', fileName)
        # os.path.join picks the right separator for the platform, and
        # binary mode stores the downloaded bytes without newline changes.
        with open(os.path.join(dirPath, safeName + '.txt'), 'wb') as outFile:
            outFile.write(html)

    def verifyDir(self, path):
        """Create ``path`` (and missing parents) unless it already exists.

        An empty ``path`` means the current directory and is left alone,
        because ``os.makedirs('')`` would raise.
        """
        if path and not os.path.exists(path):
            os.makedirs(path)

    def _resolveLink(self, baseUrl, link):
        """Return the absolute http(s) URL for ``link``, or None to skip it.

        ``link`` is resolved relative to ``baseUrl`` and any ``#fragment``
        is dropped, since a fragment points into the same document.  Links
        with other schemes (``mailto:``, ``javascript:`` ...) yield None.
        """
        absolute = urlparse.urljoin(baseUrl, link.strip())
        absolute = urlparse.urldefrag(absolute)[0]
        if urlparse.urlparse(absolute).scheme.lower() not in ('http', 'https'):
            return None
        return absolute

    def crawl(self, dir_path):
        """Fetch every queued URL, save it, and queue the links it contains.

        Returns when the queue is empty.  Pages that cannot be downloaded
        are skipped.  Pages are saved into ``dir_path``.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            # Mark as visited before fetching so that a URL which fails is
            # not queued again by every page that links to it.
            self.crawled.add(crawling)
            print('\n\nStart Crawling - ' + crawling + '\n')
            try:
                response = urllib2.urlopen(crawling)
                try:
                    msg = response.read()
                finally:
                    response.close()
            except Exception:
                # Best effort: skip pages that cannot be fetched.  Unlike a
                # bare ``except`` this still lets Ctrl-C stop the crawler.
                continue
            self.writeHTML(crawling, msg, dir_path)
            # Display page title
            print(self.getTitle(msg))
            self.writeToFile(crawling)
            for link in self.linkregex.findall(msg):
                target = self._resolveLink(crawling, link)
                if target is not None and target not in self.crawled:
                    print('----' + target)
                    self.tocrawl.add(target)
[/sourcecode]
How to run this crawler?
Run the following commands in the Python shell.
[sourcecode language="python"]
import sys
import re
import urllib2
import urlparse
import datetime
import os
class WebCrawler:
    """Simple crawler that follows ``<a href>`` links and saves each page.

    Every page that can be fetched is written to a text file inside the
    chosen directory, and its URL is appended to ``hyperlinks.txt`` in the
    current working directory.  Written for Python 2 (``urllib2``,
    ``urlparse`` and ``raw_input``).
    """

    # Class-level defaults are kept so references such as
    # ``WebCrawler.crawled`` still resolve; ``__init__`` gives every
    # instance its own sets so two crawlers never share state.
    tocrawl = set([])
    crawled = set([])
    # Captures the href value of an anchor tag.  ``\s+`` prevents matching
    # tags such as ``<abbr>``; the optional group lets other attributes
    # precede ``href``; only real quote characters delimit the value.
    linkregex = re.compile(
        r'<a\s+(?:[^>]*?\s)?href=[\'"](.*?)[\'"]', re.IGNORECASE)
    # Captures the text between <title ...> and </title> in any letter case.
    titleregex = re.compile(
        r'<title(?:\s[^>]*)?>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    def __init__(self):
        """Create a crawler with its own empty queue and visited set."""
        self.tocrawl = set()
        self.crawled = set()

    def setBaseURL(self, url):
        """Reset the queue so that crawling starts from ``url`` only."""
        self.tocrawl = set([url])

    def run(self):
        """Ask for a start URL and an output directory, then crawl."""
        url = raw_input('Enter an URL to crawl\n')
        self.setBaseURL(url)
        # Named outputDir rather than ``dir`` to avoid shadowing the builtin.
        outputDir = raw_input('Where should I put crawled HTML source files?\n')
        self.crawl(outputDir)

    def getTitle(self, html):
        """Return the text of the first ``<title>`` element, or None.

        The tag name is matched case-insensitively and may carry
        attributes.  The title text is returned exactly as it appears.
        """
        match = self.titleregex.search(html)
        if match is None:
            return None
        return match.group(1)

    def writeToFile(self, url):
        """Append ``url`` as one line to ``hyperlinks.txt``."""
        with open('hyperlinks.txt', 'a') as logFile:
            logFile.write(url + '\n')

    def writeHTML(self, fileName, html, dirPath):
        """Save ``html`` as ``<dirPath>/<sanitised fileName>.txt``.

        Characters that are illegal in Windows file names, including both
        slash kinds, are removed from ``fileName``.  The directory is
        created first when it does not exist.
        """
        self.verifyDir(dirPath)
        # Raw string so that the backslash itself is part of the class.
        safeName = re.sub(r'[\\/:"*?<>|]', '', fileName)
        # os.path.join picks the right separator for the platform, and
        # binary mode stores the downloaded bytes without newline changes.
        with open(os.path.join(dirPath, safeName + '.txt'), 'wb') as outFile:
            outFile.write(html)

    def verifyDir(self, path):
        """Create ``path`` (and missing parents) unless it already exists.

        An empty ``path`` means the current directory and is left alone,
        because ``os.makedirs('')`` would raise.
        """
        if path and not os.path.exists(path):
            os.makedirs(path)

    def _resolveLink(self, baseUrl, link):
        """Return the absolute http(s) URL for ``link``, or None to skip it.

        ``link`` is resolved relative to ``baseUrl`` and any ``#fragment``
        is dropped, since a fragment points into the same document.  Links
        with other schemes (``mailto:``, ``javascript:`` ...) yield None.
        """
        absolute = urlparse.urljoin(baseUrl, link.strip())
        absolute = urlparse.urldefrag(absolute)[0]
        if urlparse.urlparse(absolute).scheme.lower() not in ('http', 'https'):
            return None
        return absolute

    def crawl(self, dir_path):
        """Fetch every queued URL, save it, and queue the links it contains.

        Returns when the queue is empty.  Pages that cannot be downloaded
        are skipped.  Pages are saved into ``dir_path``.
        """
        while self.tocrawl:
            crawling = self.tocrawl.pop()
            # Mark as visited before fetching so that a URL which fails is
            # not queued again by every page that links to it.
            self.crawled.add(crawling)
            print('\n\nStart Crawling - ' + crawling + '\n')
            try:
                response = urllib2.urlopen(crawling)
                try:
                    msg = response.read()
                finally:
                    response.close()
            except Exception:
                # Best effort: skip pages that cannot be fetched.  Unlike a
                # bare ``except`` this still lets Ctrl-C stop the crawler.
                continue
            self.writeHTML(crawling, msg, dir_path)
            # Display page title
            print(self.getTitle(msg))
            self.writeToFile(crawling)
            for link in self.linkregex.findall(msg):
                target = self._resolveLink(crawling, link)
                if target is not None and target not in self.crawled:
                    print('----' + target)
                    self.tocrawl.add(target)
[/sourcecode]
How to run this crawler?
Run the following commands in the Python shell.
# Presumably the class above has been saved as web_crawler.py somewhere on
# the import path - adjust the module name if you saved it differently.
import web_crawler
# Create a crawler instance.
w = web_crawler.WebCrawler()
# run() prompts for the start URL and the output directory, then crawls.
w.run()
I blog quite often and I truly appreciate your content.
This great article has truly piqued my interest. I'm going to bookmark your website and keep checking for new details about once a week. I opted in for your RSS feed too.
I really like your blog. Very nice colors &
theme. Did you make this website yourself
or did you hire someone to do it for you? Plz answer back as I'm looking to construct my own blog and
would like to know where u got this from. thank you
Hi,
Thanks for commenting. If you really need to create a nice blog, you can use ready-made templates. If you're going to host your blog yourself with WordPress, there are no limits: you can select and upload a theme yourself. But if you're going to use a wordpress.com blog like this one, you have fewer options. However, wordpress.com also offers some pre-built templates, and this blog uses one of them.
Anyway, I'm not sure whether this clears up your doubts.
Hello, I think your blog could possibly be having internet browser compatibility issues.
Whenever I take a look at your site in Safari, it looks fine, but when opening it in I.E., it has some overlapping issues.
I just wanted to give you a quick heads up!
Aside from that, excellent website!