-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtestScrape.py
More file actions
16 lines (15 loc) · 4.11 KB
/
testScrape.py
File metadata and controls
16 lines (15 loc) · 4.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from scrapy.selector import Selector
import re
import string
#filename="14d9010d785bd789.eml"
#A sample of Dice emails about mid-way through the database history
# Root directory of the local Gmail backup. The trailing separator is kept so
# that the relative paths below can be appended by plain concatenation.
rootpath = "C:\\Users\\Jake\\Documents\\Cont. Ed\\gyb\\gyb-0.47-windows\\gyb\\GYB-GMail-Backup-jacobbumpus@gmail.com\\"

# Sampled .eml messages, relative to rootpath, newest first. The layout appears
# to be year\month\day\<message id>.eml.
filenames = [
    '2015\\9\\11\\14fbc31f89b76495.eml',
    '2015\\9\\10\\14fb70bc8f5c7ac1.eml',
    '2015\\9\\9\\14fb1e4778ce4038.eml',
    '2015\\9\\8\\14facbfce2eb92b1.eml',
    '2015\\9\\7\\14fa795aa86f3d91.eml',
    '2015\\9\\2\\14f8dd6d4993cc17.eml',
    '2015\\9\\1\\14f88b8637801905.eml',
    '2015\\8\\31\\14f838c10fc43817.eml',
    '2015\\8\\30\\14f7e66a0510f86e.eml',
    '2015\\8\\29\\14f79413a4378491.eml',
    '2015\\8\\27\\14f6ef663fecf988.eml',
    '2015\\8\\25\\14f64b4495dece5f.eml',
    '2015\\8\\24\\14f5f8039488afee.eml',
    '2015\\8\\23\\14f5a57e069192aa.eml',
    '2015\\8\\22\\14f5532542ecdac6.eml',
    '2015\\8\\21\\14f500dc75cf2cfe.eml',
    '2015\\8\\20\\14f4ae7036aad5eb.eml',
    '2015\\8\\19\\14f45c06e930ad0a.eml',
    '2015\\8\\18\\14f40a53e9dc2fc6.eml',
    '2015\\8\\17\\14f3b7528a4a5556.eml',
    '2015\\8\\16\\14f364e5a1cd8bc9.eml',
    '2015\\8\\15\\14f312864331b03f.eml',
    '2015\\8\\14\\14f2c0322e2e1405.eml',
    '2015\\8\\13\\14f26deec8e36df6.eml',
    '2015\\8\\12\\14f21b7f10d9c65a.eml',
    '2015\\8\\11\\14f1c9370f57273a.eml',
    '2015\\8\\10\\14f176902a89928b.eml',
    '2015\\8\\9\\14f1245ca6156502.eml',
    '2015\\8\\8\\14f0d1f291da7159.eml',
    '2015\\8\\7\\14f07f9596ade63f.eml',
    '2015\\8\\6\\14f02d55fc30ead3.eml',
    '2015\\8\\5\\14efda6cff001989.eml',
    '2015\\8\\3\\14ef35fef4195337.eml',
    '2015\\8\\2\\14eee36e974ffc05.eml',
    '2015\\8\\1\\14ee9129534d36cd.eml',
    '2015\\7\\31\\14ee3ea4c47a3903.eml',
    '2015\\7\\30\\14edebc6a09b7f60.eml',
    '2015\\7\\29\\14ed9955124fc51d.eml',
    '2015\\7\\28\\14ed47ecc8e031a1.eml',
    '2015\\7\\27\\14ecf5d3e1ebdd58.eml',
    '2015\\7\\26\\14eca31e0b02f88e.eml',
    '2015\\7\\25\\14ec50de3c82ce53.eml',
    '2015\\7\\24\\14ebfeb11b5215b2.eml',
    '2015\\7\\23\\14ebac00d0cca6e5.eml',
    '2015\\7\\22\\14eb59bf3c0208a8.eml',
    '2015\\7\\21\\14eb071f98803ca6.eml',
    '2015\\7\\20\\14eab4a76f42443c.eml',
    '2015\\7\\19\\14ea62585a4b6314.eml',
    '2015\\7\\18\\14ea0fe5fdd0210f.eml',
    '2015\\7\\17\\14e9bda6b9dd3976.eml',
    '2015\\7\\16\\14e96b5709034992.eml',
    '2015\\7\\15\\14e918dcbe7eb07a.eml',
    '2015\\7\\14\\14e8c545a5d806e6.eml',
    '2015\\7\\13\\14e873f742585c99.eml',
    '2015\\7\\12\\14e8219dcce8e953.eml',
    '2015\\7\\11\\14e7cf3d38457a7f.eml',
    '2015\\7\\10\\14e77bad153ca3de.eml',
    '2015\\7\\9\\14e729527cfb8dd3.eml',
    '2015\\7\\8\\14e6d6febb4a9d13.eml',
    '2015\\7\\7\\14e6859304ffca2d.eml',
    '2015\\7\\6\\14e6324c195531e2.eml',
    '2015\\7\\5\\14e5dfe22b80b910.eml',
    '2015\\7\\4\\14e58d6e1ac7a6b2.eml',
    '2015\\7\\3\\14e53b0644b94133.eml',
    '2015\\7\\2\\14e4e89a5ed7beac.eml',
    '2015\\7\\1\\14e49649930367b6.eml',
    '2015\\6\\30\\14e4443d555e8647.eml',
    '2015\\6\\29\\14e3f199278e5728.eml',
    '2015\\6\\28\\14e39f2cc266e3cb.eml',
    '2015\\6\\27\\14e34cc145a56f5c.eml',
    '2015\\6\\26\\14e2fa5ae70b6aae.eml',
    '2015\\6\\25\\14e2a80d068ab473.eml',
    '2015\\6\\24\\14e255bcd70f9ca3.eml',
    '2015\\6\\23\\14e203658f2e2b79.eml',
    '2015\\6\\22\\14e1b11f8a2cb50d.eml',
    '2015\\6\\21\\14e15ebe4e553ef6.eml',
    '2015\\6\\20\\14e10c5c8a844e74.eml',
    '2015\\6\\19\\14e0b9e705299bab.eml',
    '2015\\6\\18\\14e0675f8e423544.eml',
    '2015\\6\\17\\14e01512d507d619.eml',
    '2015\\6\\16\\14dfc2fb7020caec.eml',
    '2015\\6\\15\\14df7070190743ff.eml',
    '2015\\6\\14\\14df1e00f6cc81fd.eml',
    '2015\\6\\13\\14decbb1bdbfbe7c.eml',
    '2015\\6\\12\\14de795c71addd37.eml',
    '2015\\6\\11\\14de26d85cd1f975.eml',
    '2015\\6\\10\\14ddd47cdd511fa3.eml',
    '2015\\6\\9\\14dd9f854be9c8ca.eml',
    '2015\\6\\8\\14dd2f7060cf64e7.eml',
    '2015\\6\\7\\14dcdd869f307c3f.eml',
    '2015\\6\\6\\14dc8b22856c045c.eml',
    '2015\\6\\5\\14dc389bf485473a.eml',
    '2015\\6\\4\\14dbe658073e09c0.eml',
    '2015\\6\\3\\14db93fa6c39602c.eml',
    '2015\\6\\2\\14db422bbbddffe2.eml',
    '2015\\6\\1\\14daef41653d6e46.eml',
    '2015\\5\\31\\14da9cdeb6e3fe6e.eml',
    '2015\\5\\30\\14da4a9a8ab4aee2.eml',
    '2015\\5\\29\\14d9f811760ef781.eml',
    '2015\\5\\28\\14d9a5b9f755bcf4.eml',
    '2015\\5\\27\\14d9535d7b0890ff.eml',
    '2015\\5\\26\\14d9010d785bd789.eml',
]
# For each sampled message, print its relative path followed by either the
# extracted job title or a placeholder line when no title cell is found.
for eml_path in filenames:
    # The context manager closes the handle even if read() raises; the
    # original left every file open for the lifetime of the script.
    with open(rootpath + eml_path, 'r') as eml_file:
        body = eml_file.read()

    print(eml_path)

    # The raw message text is parsed without decoding its quoted-printable
    # encoding, so `class="job-title"` appears in the markup as
    # `class=3D"job-title"`; the XPath matches that literal attribute value.
    titles = Selector(text=body).xpath(
        "//td[@class='3D\"job-title\"']/a/text()"
    ).extract()

    try:
        raw_title = titles[0]
    except IndexError:
        # No matching cell in this message.
        print("Assign null instead")
    else:
        # Strip quoted-printable soft line breaks ("=" at end of line) that
        # split long titles, for both CRLF and LF line endings.
        title = raw_title.replace("=\r\n", "").replace("=\n", "")
        print("Job title : " + title)