https://www.foxsports.com.au/nrl/nrl-premiership/teams/sharks/james-maloneys-shock-confession-over-relationship-with-shane-flanagan/news-story/7134a366ebb93358cb23a2291ff78409
https://www.foxsports.com.au/nrl/nrl-premiership/teams/cowboys/michael-morgan-reveals-chances-of-nrl-return-for-cowboys-in-round-2/news-story/b05717e27eae7b4b9e37600f52bc6d27
https://www.foxsports.com.au/nrl/nrl-premiership/round-2-nrl-late-mail-michael-morgan-races-the-clock-valentine-holmes-fullback-switch/news-story/7aa197f891d0040e4cbf40379f8d008c
https://www.foxsports.com.au/football/socceroos/bert-van-marwijk-trims-socceroos-squad-for-friendlies-with-norway-and-colombia/news-story/2fc3aca788a999cf5db95d7d5cc9a027
https://www.foxsports.com.au/football/premier-league/teams/manchester-united/romelu-lukaku-accuses-manchester-united-colleagues-of-hiding-in-loss-to-sevilla/news-story/27594ec0847505167c5a4edab34bff0a
https://www.foxsports.com.au/football/uefa-champions-league/joes-mourinho-ive-sat-in-this-chair-twice-before-with-porto-with-real-madrid/news-story/2f09690c42efefe58405356c3148f01b
https://www.foxsports.com.au/football/a-league/aleague-hour-on-fox-sports-mark-bosnichs-indepth-expansion-plan-to-fix-the-aleague/news-story/ecbb2ded5a8e105b58e435f6de92ed8d
https://www.foxsports.com.au/football/asian-champions-league/live-asian-champions-league-melbourne-victory-v-kawasaki-frontale/news-story/51a78d57a1e600f239d9a2effffbdd66
https://www.foxsports.com.au/football/socceroos/massimo-luongos-qpr-recorded-a-shock-31-win-at-mile-jedinaks-promotion-hopefuls-aston-villa/news-story/2e8e0eccac5ab8595a9f02e76cf6dec0
https://www.foxsports.com.au/football/asian-champions-league/sydney-fc-cannot-afford-another-loss-in-the-asian-champions-league-when-they-play-kashima-antlers/news-story/99e01e4b4baf375135432d801faee669
# Extract URLs from the text file of article links
with open("crawl_list.txt", "r") as url_list_file:
    crawlList = [line for line in url_list_file]
# strip the trailing newline from each url
crawlList = [url.replace('\n', '') for url in crawlList]
print(crawlList)
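# optional sanity check (not part of the original script): confirm the newline
# strip worked and every entry still looks like a full article url
print(len(crawlList), "urls loaded")
assert all(u.startswith("https://") for u in crawlList)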
# helper function to extract all key information from a given article
from newspaper import Article

def ArticleDataExtractor(some_url):
    '''Pull out all key information from a given article url.'''
    output = {}
    article = Article(some_url)
    article.download()
    article.parse()
    output['url'] = some_url
    output['authors'] = article.authors
    output['pubDate'] = str(article.publish_date)
    output['title'] = article.title
    output['text'] = article.text
    # run newspaper's built-in NLP to get keywords and a summary
    article.nlp()
    output['keywords'] = article.keywords
    output['summary'] = article.summary
    return output
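# quick usage sketch (an addition, not part of the original script): run the
# extractor on a single url from the crawl list and inspect a few fields before
# crawling everything. Note that article.nlp() may need the nltk 'punkt' data,
# e.g. via nltk.download('punkt'), if it is not already installed.
sample = ArticleDataExtractor(crawlList[0])
print(sample['title'])
print(sample['pubDate'])
print(sample['keywords'])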
# save each article's extracted data in json format
import json
import os

# make sure the output directory exists before writing (open() fails otherwise)
os.makedirs("data", exist_ok=True)

for url in crawlList:
    # the last path segment of each url is a unique article id
    articleID = url.split("/")[-1]
    extractedData = ArticleDataExtractor(url)
    my_filename = 'data/' + articleID + '.json'
    with open(my_filename, 'w') as fp:
        json.dump(extractedData, fp, indent=4)
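# optional round-trip check (an addition, not part of the original script):
# re-load the json file written for the first url to confirm the dump worked
firstID = crawlList[0].split("/")[-1]
with open('data/' + firstID + '.json') as fp:
    reloaded = json.load(fp)
print(reloaded['title'])
print(len(reloaded['text']), "characters of article text")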