From d5f4540f65f7aa25c5076516b3538cbf3eb5b88a Mon Sep 17 00:00:00 2001
From: santhoshse7en
Date: Sat, 29 Jun 2019 17:01:40 +0530
Subject: [PATCH] news-fetch

---
 LICENSE                |   2 +-
 README.md              |  73 +++++++++++++-
 news_fetch/__init__.py |   1 +
 news_fetch/news.py     | 209 +++++++++++++++++++++++++++++++++++++++++
 news_fetch/utils.py    |  13 +++
 requirements.txt       |   8 ++
 setup.cfg              |   2 +
 setup.py               |  39 ++++++++
 8 files changed, 344 insertions(+), 3 deletions(-)
 create mode 100644 news_fetch/__init__.py
 create mode 100644 news_fetch/news.py
 create mode 100644 news_fetch/utils.py
 create mode 100644 requirements.txt
 create mode 100644 setup.cfg
 create mode 100644 setup.py

diff --git a/LICENSE b/LICENSE
index 2fdf788..79e30dc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2019 M Santhosh Kumar
+Copyright (c) [2019] [M Santhosh Kumar]
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 9b85133..7b8b236 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,71 @@
-# news-fetch
-A Python Package which helps to scrape all news details
+[![PyPI Version](https://img.shields.io/pypi/v/news_fetch.svg)](https://pypi.org/project/news_fetch)
+[![Coverage Status](https://coveralls.io/repos/github/santhoshse7en/news_fetch/badge.svg?branch=master)](https://coveralls.io/github/santhoshse7en/news_fetch?branch=master)
+[![License](https://img.shields.io/pypi/l/news_fetch.svg)](https://pypi.python.org/pypi/news_fetch/)
+[![Documentation Status](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://santhoshse7en.github.io/news_fetch_doc)
+
+# news_fetch
+
+news_fetch scrapes news-related attributes with the help of [Google Search](https://www.google.com/) and [Newspaper3k](https://pypi.org/project/newspaper3k/), which reduces the number of NaN, '', [] or None values in the scraped results.
+
+| Source | Link |
+| --- | --- |
+| PyPI | https://pypi.org/project/news_fetch/ |
+| Repository | https://github.com/santhoshse7en/news_fetch/ |
+| Documentation | https://santhoshse7en.github.io/news_fetch_doc/ |
+
+## Dependencies
+
+- beautifulsoup4
+- selenium
+- chromedriver-binary
+- fake_useragent
+- newspaper3k
+- pandas
+- pattern
+
+## Dependencies Installation
+
+Use the package manager [pip](https://pip.pypa.io/en/stable/) to install the following:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+Download the source by clicking the green download button on [GitHub](https://github.com/santhoshse7en/news_fetch/archive/master.zip). To extract URLs from a targeted website, create a `google_search` object; the only arguments you need to pass are the keyword and the newspaper's URL.
+
+```python
+>>> from news_fetch.news import google_search
+>>> google = google_search('Alcoholics Anonymous', 'https://timesofindia.indiatimes.com/')
+```
+
+**Directory of the google search result URLs**
+
+![google](https://user-images.githubusercontent.com/47944792/60381562-67363380-9a74-11e9-99ea-51c27bf08abc.PNG)
+
+To scrape all the news details, create a `newspaper` object with the article URL.
+
+```python
+>>> from news_fetch.news import newspaper
+>>> news = newspaper('https://www.bbc.co.uk/news/world-48810070')
+```
+
+**Directory of the newspaper object**
+
+![news](https://user-images.githubusercontent.com/47944792/60381950-969b6f00-9a79-11e9-8167-c9cb45033c91.PNG)
+
+```python
+>>> news.headline
+'g20 summit: trump and xi agree to restart us china trade talks'
+```
+
+## Contributing
+
+Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
+
+Please make sure to update tests as appropriate.
+
+## License
+
+[MIT](https://choosealicense.com/licenses/mit/)
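The `newspaper` object shown in the README above exposes each scraped field as a plain attribute, and every attribute falls back to `None` when extraction fails (see `news.py` below). A minimal sketch of collecting the whole record into a dict, using only attribute names that `news.py` actually sets:

```python
>>> record = {
...     'headline': news.headline,        # cleaned article title
...     'author': news.author,            # from the page's ld+json metadata
...     'date': news.date,                # publication date
...     'publication': news.publication,  # publisher name
...     'category': news.category,        # derived from the URL path
...     'keywords': news.keywords,        # from Newspaper3k's nlp()
...     'summary': news.summary,          # from Newspaper3k's nlp()
...     'description': news.description,  # meta description
...     'article': news.article,          # cleaned body text
... }
```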
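The two classes are meant to be chained: `google_search` gathers the URLs and `newspaper` scrapes each one. A sketch of that pipeline collecting results into a pandas DataFrame (pandas is a declared dependency; the loop itself is illustrative and not part of the package):

```python
>>> import pandas as pd
>>> from news_fetch.news import google_search, newspaper
>>> google = google_search('Alcoholics Anonymous', 'https://timesofindia.indiatimes.com/')
>>> rows = []
>>> for url in google.urls:
...     try:
...         news = newspaper(url)
...         rows.append({'url': url, 'headline': news.headline, 'date': news.date})
...     except Exception:
...         pass  # skip URLs that Newspaper3k cannot download or parse
...
>>> df = pd.DataFrame(rows)
```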
diff --git a/news_fetch/__init__.py b/news_fetch/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/news_fetch/__init__.py
@@ -0,0 +1 @@
+
diff --git a/news_fetch/news.py b/news_fetch/news.py
new file mode 100644
index 0000000..f9654ba
--- /dev/null
+++ b/news_fetch/news.py
@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 29 10:10:04 2019
+
+@author: M. Santhosh Kumar
+"""
+from news_fetch.utils import *
+
+
+class google_search:
+
+    def __init__(self, keyword, newspaper_url):
+
+        self.keyword = keyword
+        self.newspaper_url = newspaper_url
+
+        # Randomise the User-Agent so Google is less likely to block the requests.
+        random_headers = {'User-Agent': UserAgent().random,
+                          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
+
+        self.search_term = str(self.keyword) + ' site:' + str(self.newspaper_url)
+
+        sys.stdout.write('\r' + 'Google Search Keyword : ' + str(self.search_term) + '\r')
+        sys.stdout.flush()
+
+        url = 'https://www.google.com/search?q=' + '+'.join(self.search_term.split())
+
+        soup = BeautifulSoup(get(url, headers=random_headers).text, 'lxml')
+
+        try:
+            # Result count rendered without thousands separators, e.g. "About 680 results (0.23 seconds)".
+            max_pages = round([int(s) for s in soup.select_one('div#resultStats').text.split() if s.isdigit()][0] / 10)
+            max_pages = max_pages + 1
+        except:
+            # Result count rendered with comma separators, e.g. "About 1,080 results (0.23 seconds)".
+            max_pages = round(int(''.join(i for i in soup.select_one('div#resultStats').text if i.isdigit())) / 10)
+            max_pages = max_pages + 1
+
+        url_list = []
+
+        options = Options()
+        options.headless = True
+        browser = webdriver.Chrome(options=options)
+        browser.get(url)
+
+        index = 0
+
+        while True:
+            try:
+                index += 1
+                page = browser.page_source
+                soup = BeautifulSoup(page, 'lxml')
+                # Each organic result sits in a div with class "r"; collect its first link.
+                linky = [soup.select('.r')[i].a['href'] for i in range(len(soup.select('.r')))]
+                url_list.extend(linky)
+                if index == max_pages:
+                    break
+                # Click the "Next" pagination button and wait for the next page to load.
+                browser.find_element_by_xpath('//*[@id="pnnext"]/span[2]').click()
+                time.sleep(2)
+                sys.stdout.write('\r' + str(index) + ' : ' + str(max_pages) + '\r')
+                sys.stdout.flush()
+            except:
+                pass
+
+        browser.quit()
+
+        # Deduplicate while preserving order.
+        self.urls = list(dict.fromkeys(url_list))
+        sys.stdout.write('\r' + 'Total google search result urls extracted from the above keyword : ' + str(len(self.urls)) + '\r')
+        sys.stdout.flush()
+
+
+class newspaper:
+
+    def __init__(self, url):
+        self.url = url
+        article = Article(self.url, request_timeout=10)
+        article.download()
+        article.parse()
+        article.nlp()
+        soup = BeautifulSoup(article.html, 'lxml')
+
+        def cleaning_text(text):
+            # Strip URLs, lowercase, and keep only simple word/punctuation tokens.
+            text = re.sub(r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*', ' ', text.lower())
+            words = re.findall(r'[a-zA-Z0-9:.,]+', text)
+            return ' '.join(words)
+
+        def author(soup):
+            # Look through up to three ld+json blocks for an author name.
+            i = 0
+            while True:
+                try:
+                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
+                    author = meta.get('author')['name']
+                    if '' != author:
+                        break
+                except:
+                    pass
+                i += 1
+                if i == 3:
+                    break
+            return author
+
+        def date(soup):
+            # Look through up to three ld+json blocks for a publication date.
+            i = 0
+            while True:
+                try:
+                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
+                    date = meta.get('datePublished')
+                    if '' != date:
+                        break
+                except:
+                    pass
+                i += 1
+                if i == 3:
+                    break
+            return date
+
+        def publisher(soup):
+            # Look through up to three ld+json blocks for a publisher name.
+            i = 0
+            while True:
+                try:
+                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
+                    publisher = meta.get('publisher')['name']
+                    if '' != publisher:
+                        break
+                except:
+                    pass
+                i += 1
+                if i == 3:
+                    break
+            return publisher
+
+        """
+        :returns: author name
+        """
+        try:
+            self.author = author(soup)
+        except:
+            self.author = None
+
+        """
+        :returns: published date
+        """
+        try:
+            try:
+                self.date = date(soup)
+            except:
+                self.date = article.meta_data['article']['published_time']
+        except:
+            self.date = None
+
+        """
+        :returns: article text
+        """
+        try:
+            self.article = cleaning_text(article.text)
+        except:
+            self.article = None
+
+        """
+        :returns: headline
+        """
+        try:
+            self.headline = cleaning_text(article.title)
+        except:
+            self.headline = None
+
+        """
+        :returns: keywords
+        """
+        try:
+            self.keywords = article.keywords
+        except:
+            self.keywords = None
+
+        """
+        :returns: summary
+        """
+        try:
+            self.summary = cleaning_text(article.summary)
+        except:
+            self.summary = None
+
+        """
+        :returns: description
+        """
+        try:
+            try:
+                self.description = cleaning_text(article.meta_description)
+            except:
+                self.description = cleaning_text(article.meta_data['description'])
+        except:
+            self.description = None
+
+        """
+        :returns: publication
+        """
+        try:
+            try:
+                self.publication = publisher(soup)
+            except:
+                self.publication = article.meta_data['og']['site_name']
+        except:
+            self.publication = None
+
+        """
+        :returns: category
+        """
+        try:
+            try:
+                # Take the first URL path segment and spell-correct it with pattern's suggest().
+                text = cleaning_text((article.url[len(article.source_url):])).split()[1]
+                self.category = [item[0] for item in suggest(text)][0]
+            except:
+                self.category = article.meta_data['article']['section']
+        except:
+            self.category = None
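The `author`, `date`, and `publisher` helpers above all apply the same technique: walk the page's `application/ld+json` script tags and return the first non-empty field that parses. A self-contained sketch of that pattern, using hypothetical example HTML:

```python
import json

from bs4 import BeautifulSoup

html = '''
<script type="application/ld+json">
{"@type": "NewsArticle", "datePublished": "2019-06-29",
 "author": {"name": "Jane Doe"}, "publisher": {"name": "Example Times"}}
</script>
'''

soup = BeautifulSoup(html, 'lxml')

def first_ld_json_field(soup, getter, limit=3):
    """Return the first non-empty value `getter` extracts from up to `limit` ld+json blocks."""
    for script in soup.select('script[type="application/ld+json"]')[:limit]:
        try:
            value = getter(json.loads(script.text))
            if value:
                return value
        except (ValueError, KeyError, TypeError):
            continue  # malformed JSON or missing key: try the next block
    return None

print(first_ld_json_field(soup, lambda meta: meta['author']['name']))  # Jane Doe
print(first_ld_json_field(soup, lambda meta: meta['datePublished']))   # 2019-06-29
```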
diff --git a/news_fetch/utils.py b/news_fetch/utils.py
new file mode 100644
index 0000000..968c0b6
--- /dev/null
+++ b/news_fetch/utils.py
@@ -0,0 +1,13 @@
+from selenium.webdriver.chrome.options import Options
+from fake_useragent import UserAgent
+from selenium import webdriver
+from pattern.en import suggest
+from newspaper import Article
+from bs4 import BeautifulSoup
+import chromedriver_binary  # noqa: F401 -- importing puts the bundled chromedriver on PATH
+from requests import get
+import time
+import nltk
+import json
+import sys
+import re
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3b752a5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+beautifulsoup4
+selenium
+chromedriver-binary==74.0.3729.6.0
+newspaper3k
+pandas
+pattern
+fake_useragent
+setuptools
+twine
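Note that `article.nlp()` in `news.py` (the call that produces `keywords` and `summary`) depends on NLTK tokenizer data that installing `newspaper3k` does not fetch by itself. If it fails with a missing-resource error, a one-time download is needed (standard NLTK call; default data path assumed):

```python
>>> import nltk
>>> nltk.download('punkt')  # sentence tokenizer models used by Newspaper3k's nlp()
```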
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..b88034e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+description-file = README.md
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..dbc4fae
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,39 @@
+# -*- encoding: utf-8 -*-
+"""A setuptools based setup module.
+
+See:
+https://packaging.python.org/guides/distributing-packages-using-setuptools/
+https://santhoshse7en.github.io/news_fetch/
+https://santhoshse7en.github.io/news_fetch_doc/
+"""
+from __future__ import absolute_import
+from __future__ import print_function
+
+# Always prefer setuptools over distutils
+import setuptools
+
+keywords = ['Newspaper', 'news_fetch', 'without-api', 'google_scraper', 'news_scraper', 'bs4', 'lxml']
+
+setuptools.setup(
+    name='news_fetch',
+    version='0.0.4',
+    author='M Santhosh Kumar',
+    author_email='santhoshse7en@gmail.com',
+    description='A Python package that scrapes news details',
+    long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
+    url='https://santhoshse7en.github.io/news_fetch/',
+    keywords=keywords,
+    install_requires=['beautifulsoup4', 'pandas', 'selenium', 'pattern', 'fake_useragent',
+                      'newspaper3k', 'chromedriver-binary==74.0.3729.6.0'],
+    packages=setuptools.find_packages(),
+    classifiers=['Development Status :: 4 - Beta',
+                 'Intended Audience :: End Users/Desktop',
+                 'Intended Audience :: Developers',
+                 'Intended Audience :: System Administrators',
+                 'License :: OSI Approved :: MIT License',
+                 'Operating System :: OS Independent',
+                 'Programming Language :: Python',
+                 'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
+                 ],
+)
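requirements.txt lists setuptools and twine, which points at the intended release flow. The standard build-and-upload steps for this setup.py (generic setuptools/twine commands, nothing repo-specific):

```bash
python setup.py sdist bdist_wheel   # build source and wheel distributions into dist/
twine upload dist/*                 # publish to PyPI
```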