# meta database phase 2
# Extract urls from text file
# Read one URL per line from crawl_list.txt into crawlList.
# BUG FIX: the original did url.replace('n', ''), which removed every
# letter 'n' from the URLs instead of the intended newline '\n'.
# strip() removes the trailing newline (and any stray whitespace) safely,
# and the with-statement guarantees the file handle is closed.
with open("crawl_list.txt", "r") as url_list_file:
    crawlList = [line.strip() for line in url_list_file]

# helper function to extract information from a given article

def ArticleDataExtractor(some_url):
    '''Fetch an article and return its key metadata as a dict.

    Parameters
    ----------
    some_url : str
        URL of the article to download and parse.

    Returns
    -------
    dict
        Keys: 'url', 'authors', 'pubDate', 'title', 'text',
        'keywords', 'summary'.

    Raises
    ------
    newspaper.article.ArticleException
        If the download or parse fails.
    '''
    from newspaper import Article
    article = Article(some_url)
    # BUG FIX: the original never downloaded or parsed the article, so
    # every extracted field came back empty. Both calls are required
    # before any attribute access.
    article.download()
    article.parse()
    output = {}
    output['url'] = some_url
    output['authors'] = article.authors
    output['pubDate'] = str(article.publish_date)  # publish_date may be None -> "None"
    output['title'] = article.title
    output['text'] = article.text
    # do some NLP
    # BUG FIX: nlp() must run before keywords/summary are populated;
    # without it newspaper raises ArticleException on these attributes.
    article.nlp()
    output['keywords'] = article.keywords
    output['summary'] = article.summary
    return output
# save files in json format

import json

# Crawl every URL and persist the extracted data, one JSON file per article.
for url in crawlList:
    # Use the last path segment as the file name. rstrip('/') guards
    # against URLs ending in a slash, which would otherwise produce an
    # empty ID and make every such article overwrite 'data/.json'.
    articleID = url.rstrip('/').split('/')[-1]
    extractedData = ArticleDataExtractor(url)
    my_filename = 'data/' + str(articleID) + '.json'
    # Explicit UTF-8 so article text is written consistently across platforms.
    with open(my_filename, 'w', encoding='utf-8') as fp:
        json.dump(extractedData, fp, indent=4)

# --- webpage footer residue from the original blog post (not code) ---
# Send a Comment
# Your email address will not be published.