-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraper.py
More file actions
24 lines (20 loc) · 738 Bytes
/
scraper.py
File metadata and controls
24 lines (20 loc) · 738 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
import spacy
from goose3 import Goose
from textblob import TextBlob
# import nltk
# nltk.download('omw-1.4')
# Load the small English spaCy pipeline once at start-up. Nothing in the
# visible script uses `nlp` yet; it is kept so later processing can rely on it.
nlp = spacy.load('en_core_web_sm')

# Read the list of URLs (one per line) from URL.txt. The file is closed as soon
# as it has been read, and blank lines (including the empty entry a trailing
# newline would produce) are dropped so they are never handed to the extractor.
with open("URL.txt", "r", encoding="utf-8") as url_file:
    url_list = [line.strip() for line in url_file if line.strip()]

# One extractor instance is reused for every URL instead of being rebuilt on
# each iteration.
g = Goose()

# Open the output file once, before the loop. Opening it in "w" mode per
# article truncated the file each time, so only the last article survived;
# keeping a single handle open stores every article, in URL order.
# UTF-8 is set explicitly so non-ASCII article text does not depend on the
# platform's default encoding.
with open("Output.txt", "w", encoding="utf-8") as external_file:
    for url in url_list:
        article = g.extract(url=url)
        # Echo the extracted text to stdout and append it to Output.txt.
        print(article.cleaned_text)
        print(article.cleaned_text, file=external_file)
# The `with` block closes Output.txt on exit; no explicit close() is needed.