Meta database, phase 2: crawl a list of Fox Sports article URLs, extract each article's key data, and save the results as JSON files.

https://www.foxsports.com.au/nrl/nrl-premiership/teams/sharks/james-maloneys-shock-confession-over-relationship-with-shane-flanagan/news-story/7134a366ebb93358cb23a2291ff78409
https://www.foxsports.com.au/nrl/nrl-premiership/teams/cowboys/michael-morgan-reveals-chances-of-nrl-return-for-cowboys-in-round-2/news-story/b05717e27eae7b4b9e37600f52bc6d27
https://www.foxsports.com.au/nrl/nrl-premiership/round-2-nrl-late-mail-michael-morgan-races-the-clock-valentine-holmes-fullback-switch/news-story/7aa197f891d0040e4cbf40379f8d008c
https://www.foxsports.com.au/football/socceroos/bert-van-marwijk-trims-socceroos-squad-for-friendlies-with-norway-and-colombia/news-story/2fc3aca788a999cf5db95d7d5cc9a027
https://www.foxsports.com.au/football/premier-league/teams/manchester-united/romelu-lukaku-accuses-manchester-united-colleagues-of-hiding-in-loss-to-sevilla/news-story/27594ec0847505167c5a4edab34bff0a
https://www.foxsports.com.au/football/uefa-champions-league/joes-mourinho-ive-sat-in-this-chair-twice-before-with-porto-with-real-madrid/news-story/2f09690c42efefe58405356c3148f01b
https://www.foxsports.com.au/football/a-league/aleague-hour-on-fox-sports-mark-bosnichs-indepth-expansion-plan-to-fix-the-aleague/news-story/ecbb2ded5a8e105b58e435f6de92ed8d
https://www.foxsports.com.au/football/asian-champions-league/live-asian-champions-league-melbourne-victory-v-kawasaki-frontale/news-story/51a78d57a1e600f239d9a2effffbdd66
https://www.foxsports.com.au/football/socceroos/massimo-luongos-qpr-recorded-a-shock-31-win-at-mile-jedinaks-promotion-hopefuls-aston-villa/news-story/2e8e0eccac5ab8595a9f02e76cf6dec0
https://www.foxsports.com.au/football/asian-champions-league/sydney-fc-cannot-afford-another-loss-in-the-asian-champions-league-when-they-play-kashima-antlers/news-story/99e01e4b4baf375135432d801faee669
# Extract urls from text file
#
# Read one URL per line from crawl_list.txt. A context manager guarantees
# the file handle is closed even if iteration fails.
with open("crawl_list.txt", "r") as url_list_file:
    # BUG FIX: the original used url.replace('n', ''), which deleted every
    # letter "n" from the URL (mangling e.g. "news-story" -> "ews-story").
    # The intent was to remove the trailing newline, which strip() does.
    crawlList = [line.strip() for line in url_list_file]

# Drop blank entries (e.g. a trailing empty line in the file).
crawlList = [url for url in crawlList if url]

print(crawlList)
# helper function to extract information from a given article

def ArticleDataExtractor(some_url):
    """Download and parse the article at *some_url*.

    Returns a dict with the article's key fields:
    'url', 'authors', 'pubDate', 'title', 'text', 'keywords', 'summary'.
    """
    from newspaper import Article

    article = Article(some_url)
    article.download()
    article.parse()

    # newspaper's NLP pass populates article.keywords and article.summary.
    article.nlp()

    return {
        'url': some_url,
        'authors': article.authors,
        'pubDate': str(article.publish_date),
        'title': article.title,
        'text': article.text,
        'keywords': article.keywords,
        'summary': article.summary,
    }
# save files in json format

import json
import os

# FIX: make sure the output directory exists before writing, otherwise the
# first open() below raises FileNotFoundError on a fresh checkout.
os.makedirs('data', exist_ok=True)

for url in crawlList:
    # The last path segment of each URL is a unique article hash — use it
    # as the file name so every article gets its own JSON file.
    articleID = url.split("/")[-1]
    extractedData = ArticleDataExtractor(url)

    my_filename = 'data/' + str(articleID) + '.json'
    with open(my_filename, 'w') as fp:
        json.dump(extractedData, fp, indent = 4)

Send a Comment

Your email address will not be published.