Python Email Crawler Class
[caption id="attachment_775" align="alignnone" width="202"] Email Crawler[/caption]
[sourcecode language="python"]
import os
import re
class EmailCrawler:
    """Collect e-mail addresses from every file below a directory.

    The crawler walks a directory tree, extracts everything that looks like
    an e-mail address, keeps the unique addresses in first-seen order in
    ``email_list`` and writes them, one per line, to ``emails.txt`` inside
    ``output_path``.

    The code only uses constructs that are valid on both Python 2.7 and
    Python 3 (single-argument ``print(...)``, ``io.open``).
    """

    # Class-level default kept so ``EmailCrawler.email_list`` still resolves;
    # each instance gets its own list in __init__ so results are not shared.
    email_list = []

    # Raw string so the regex escapes are not treated as string escapes.
    # The local part may contain dots, plus signs and hyphens (so an address
    # such as john.doe@example.com is not truncated to doe@example.com) and
    # domain labels may contain digits and hyphens.
    emailregex = re.compile(r'\w[\w.+-]*@[A-Za-z0-9_.-]+\.[A-Za-z]{2,6}')

    # Directory that receives emails.txt; captured when the class is defined.
    output_path = os.getcwd()

    # Characters that get_raw_string() replaces with their backslash-escaped
    # spelling.
    _ESCAPES = {
        '\a': r'\a',
        '\b': r'\b',
        '\f': r'\f',
        '\n': r'\n',
        '\r': r'\r',
        '\t': r'\t',
        '\v': r'\v',
        '\'': r'\'',
        '\"': r'\"',
    }

    def __init__(self):
        """Create a crawler with its own, empty result list."""
        self.email_list = []

    def setBaseURL(self, url):
        """Store *url* as the only entry of the ``tocrawl`` set."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a directory and crawl it if it exists."""
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        dir_path = read_line('Enter the directory path for crawled files\n')
        # Stop here when the directory is missing instead of crawling anyway.
        if self.verifyDir(dir_path):
            self.crawl(dir_path)

    def output(self):
        """Write the collected addresses to emails.txt and print each one."""
        target = os.path.join(self.output_path, "emails.txt")
        with open(target, "w") as out:
            for email in self.email_list:
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep would double the "\r" on Windows.
                out.write(str(email) + "\n")
                print(email)

    def verifyDir(self, path):
        """Return True if *path* is a usable directory, else report and return False."""
        if self._resolve_dir(path) is None:
            print("This directory does not exist")
            return False
        return True

    def crawl(self, dir_path):
        """Scan every file below *dir_path* and record the addresses found.

        Unreadable files are reported and skipped. When the walk is finished
        the accumulated list is written out through ``output()``.
        """
        import io  # local import: provides the same text-mode open() on Python 2 and 3

        print("Crawling Email links in " + dir_path + "....\n\n")
        # Fall back to the given path so a missing directory just yields nothing.
        root = self._resolve_dir(dir_path) or dir_path
        # Set membership keeps de-duplication cheap; the list keeps the order.
        seen = set(self.email_list)
        for path, subdirs, files in os.walk(root):
            for filename in files:
                file_path = os.path.join(path, filename)
                try:
                    # errors='replace' lets binary or oddly encoded files be
                    # scanned instead of aborting the whole crawl.
                    with io.open(file_path, 'r', encoding='utf-8',
                                 errors='replace') as handle:
                        html = handle.read()
                except (IOError, OSError):
                    print("Skipping unreadable file: " + file_path)
                    continue
                for email in self.emailregex.findall(html):
                    if email not in seen:
                        seen.add(email)
                        self.email_list.append(email)
        self.output()

    def get_raw_string(self, text):
        """Return *text* with control characters and quotes backslash-escaped.

        A tab, for example, becomes a backslash followed by the letter t;
        single and double quotes get a leading backslash. All other
        characters are copied unchanged.
        """
        return ''.join(self._ESCAPES.get(char, char) for char in text)

    def _resolve_dir(self, path):
        """Return the directory to walk for *path*, or None if there is none.

        The path is used as given when it names a directory, so names that
        contain quotes keep working. Only when it does not exist is the
        escaped spelling from get_raw_string() tried, which recovers paths
        whose backslash sequences were turned into control characters.
        """
        if os.path.isdir(path):
            return path
        escaped = self.get_raw_string(path)
        if os.path.isdir(escaped):
            return escaped
        return None
[/sourcecode]
How to run this email crawler?
Load this program and run it from the Python shell (menu: Run >> Run Module).
[sourcecode language="python"]
import os
import re
class EmailCrawler:
    """Collect e-mail addresses from every file below a directory.

    The crawler walks a directory tree, extracts everything that looks like
    an e-mail address, keeps the unique addresses in first-seen order in
    ``email_list`` and writes them, one per line, to ``emails.txt`` inside
    ``output_path``.

    The code only uses constructs that are valid on both Python 2.7 and
    Python 3 (single-argument ``print(...)``, ``io.open``).
    """

    # Class-level default kept so ``EmailCrawler.email_list`` still resolves;
    # each instance gets its own list in __init__ so results are not shared.
    email_list = []

    # Raw string so the regex escapes are not treated as string escapes.
    # The local part may contain dots, plus signs and hyphens (so an address
    # such as john.doe@example.com is not truncated to doe@example.com) and
    # domain labels may contain digits and hyphens.
    emailregex = re.compile(r'\w[\w.+-]*@[A-Za-z0-9_.-]+\.[A-Za-z]{2,6}')

    # Directory that receives emails.txt; captured when the class is defined.
    output_path = os.getcwd()

    # Characters that get_raw_string() replaces with their backslash-escaped
    # spelling.
    _ESCAPES = {
        '\a': r'\a',
        '\b': r'\b',
        '\f': r'\f',
        '\n': r'\n',
        '\r': r'\r',
        '\t': r'\t',
        '\v': r'\v',
        '\'': r'\'',
        '\"': r'\"',
    }

    def __init__(self):
        """Create a crawler with its own, empty result list."""
        self.email_list = []

    def setBaseURL(self, url):
        """Store *url* as the only entry of the ``tocrawl`` set."""
        self.tocrawl = set([url])

    def run(self):
        """Prompt for a directory and crawl it if it exists."""
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3
        dir_path = read_line('Enter the directory path for crawled files\n')
        # Stop here when the directory is missing instead of crawling anyway.
        if self.verifyDir(dir_path):
            self.crawl(dir_path)

    def output(self):
        """Write the collected addresses to emails.txt and print each one."""
        target = os.path.join(self.output_path, "emails.txt")
        with open(target, "w") as out:
            for email in self.email_list:
                # Text mode already translates "\n" to the platform line
                # ending; writing os.linesep would double the "\r" on Windows.
                out.write(str(email) + "\n")
                print(email)

    def verifyDir(self, path):
        """Return True if *path* is a usable directory, else report and return False."""
        if self._resolve_dir(path) is None:
            print("This directory does not exist")
            return False
        return True

    def crawl(self, dir_path):
        """Scan every file below *dir_path* and record the addresses found.

        Unreadable files are reported and skipped. When the walk is finished
        the accumulated list is written out through ``output()``.
        """
        import io  # local import: provides the same text-mode open() on Python 2 and 3

        print("Crawling Email links in " + dir_path + "....\n\n")
        # Fall back to the given path so a missing directory just yields nothing.
        root = self._resolve_dir(dir_path) or dir_path
        # Set membership keeps de-duplication cheap; the list keeps the order.
        seen = set(self.email_list)
        for path, subdirs, files in os.walk(root):
            for filename in files:
                file_path = os.path.join(path, filename)
                try:
                    # errors='replace' lets binary or oddly encoded files be
                    # scanned instead of aborting the whole crawl.
                    with io.open(file_path, 'r', encoding='utf-8',
                                 errors='replace') as handle:
                        html = handle.read()
                except (IOError, OSError):
                    print("Skipping unreadable file: " + file_path)
                    continue
                for email in self.emailregex.findall(html):
                    if email not in seen:
                        seen.add(email)
                        self.email_list.append(email)
        self.output()

    def get_raw_string(self, text):
        """Return *text* with control characters and quotes backslash-escaped.

        A tab, for example, becomes a backslash followed by the letter t;
        single and double quotes get a leading backslash. All other
        characters are copied unchanged.
        """
        return ''.join(self._ESCAPES.get(char, char) for char in text)

    def _resolve_dir(self, path):
        """Return the directory to walk for *path*, or None if there is none.

        The path is used as given when it names a directory, so names that
        contain quotes keep working. Only when it does not exist is the
        escaped spelling from get_raw_string() tried, which recovers paths
        whose backslash sequences were turned into control characters.
        """
        if os.path.isdir(path):
            return path
        escaped = self.get_raw_string(path)
        if os.path.isdir(escaped):
            return escaped
        return None
[/sourcecode]
How to run this email crawler?
Load this program and run it from the Python shell (menu: Run >> Run Module), then enter the following commands:
# Import the module that defines EmailCrawler
# (presumably the class above saved as email_crawler.py - confirm the file name).
import email_crawler
# Create a crawler instance.
e = email_crawler.EmailCrawler()
# run() prompts for the directory to scan and then crawls it.
e.run()
0 comments: