Commit d5f4540 (1 parent: 90dfb73)
Showing 8 changed files with 344 additions and 3 deletions.
README.md
@@ -1,2 +1,71 @@
# news-fetch
A Python Package which helps to scrape all news details

[![PyPI Version](https://img.shields.io/pypi/v/news_fetch.svg)](https://pypi.org/project/news_fetch)
[![Coverage Status](https://coveralls.io/repos/github/santhoshse7en/news_fetch/badge.svg?branch=master)](https://coveralls.io/github/santhoshse7en/news_fetch?branch=master)
[![License](https://img.shields.io/pypi/l/news_fetch.svg)](https://pypi.python.org/pypi/news_fetch/)
[![Documentation Status](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://santhoshse7en.github.io/news_fetch_doc)

# news_fetch

news_fetch scrapes all the news-related attributes with the help of [Google Search](https://www.google.com/) and [Newspaper3k](https://pypi.org/project/newspaper3k/), which reduces the number of NaN, '', [] or None values in the scraped results.

| Source | Link |
| --- | --- |
| PyPI: | https://pypi.org/project/news_fetch/ |
| Repository: | https://santhoshse7en.github.io/news_fetch/ |
| Documentation: | https://santhoshse7en.github.io/news_fetch_doc/ |
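
In practice the two classes defined in news_fetch/news.py chain together: google_search collects result links, and each link can be handed to newspaper. A minimal sketch (the keyword and site are illustrative; the Usage section below walks through each step):

```python
>>> from news_fetch.news import google_search, newspaper
>>> results = google_search('Alcoholics Anonymous', 'https://timesofindia.indiatimes.com/')
>>> articles = [newspaper(url) for url in results.urls]
```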

## Dependencies

- beautifulsoup4
- selenium
- chromedriver-binary
- fake_useragent
- pandas
- pattern

## Dependencies Installation

Use the package manager [pip](https://pip.pypa.io/en/stable/) to install the following:

```bash
pip install -r requirements.txt
```

## Usage

Download it by clicking the green download button here on [Github](https://github.com/santhoshse7en/news_fetch/archive/master.zip). To extract URLs from a targeted website, instantiate the google_search class; you only need to pass the keyword and the newspaper website as arguments.

```python
>>> from news_fetch.news import google_search
>>> google = google_search('Alcoholics Anonymous', 'https://timesofindia.indiatimes.com/')
```

**Directory of google search result urls**

![google](https://user-images.githubusercontent.com/47944792/60381562-67363380-9a74-11e9-99ea-51c27bf08abc.PNG)
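
The collected links end up on the object's urls attribute (a deduplicated list, set at the end of google_search.__init__); a quick sketch with illustrative output:

```python
>>> len(google.urls)   # number of deduplicated result links
42
>>> google.urls[0]
'https://timesofindia.indiatimes.com/...'
```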

To scrape all the news details, instantiate the newspaper class:

```python
>>> from news_fetch.news import newspaper
>>> news = newspaper('https://www.bbc.co.uk/news/world-48810070')
```

**Directory of the newspaper object**

![news](https://user-images.githubusercontent.com/47944792/60381950-969b6f00-9a79-11e9-8167-c9cb45033c91.PNG)

```python
>>> news.headline
'g20 summit: trump and xi agree to restart us china trade talks'
```
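
headline is only one of the attributes the constructor populates; the full set assembled in news_fetch/news.py can be read off the same way:

```python
>>> news.author       # from the page's ld+json metadata
>>> news.date         # datePublished, falling back to article:published_time
>>> news.article      # cleaned full text
>>> news.keywords     # Newspaper3k's extracted keywords
>>> news.summary      # Newspaper3k's summary, cleaned
>>> news.description  # meta description
>>> news.publication  # publisher name, falling back to og:site_name
>>> news.category     # guessed from the URL path, falling back to article:section
```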

## Contributing

Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.

Please make sure to update tests as appropriate.

## License
[MIT](https://choosealicense.com/licenses/mit/)
@@ -0,0 +1 @@
(a new file containing a single blank line; given the package layout, most likely news_fetch/__init__.py)
news_fetch/news.py
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 29 10:10:04 2019

@author: M.Santhosh Kumar
"""
from news_fetch.utils import *


class google_search:

    def __init__(self, keyword, newspaper_url):

        self.keyword = keyword
        self.newspaper_url = newspaper_url

        random_headers = {'User-Agent': UserAgent().random,
                          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

        self.search_term = str(self.keyword) + ' site:' + str(self.newspaper_url)

        sys.stdout.write('\r' + 'Google Search Keyword : ' + str(self.search_term) + '\r')
        sys.stdout.flush()

        url = 'https://www.google.com/search?q=' + '+'.join(self.search_term.split())

        soup = BeautifulSoup(get(url, headers=random_headers).text, 'lxml')

        try:
            # Result count rendered without a comma, e.g. "About 680 results (0.23 seconds)"
            max_pages = round([int(s) for s in soup.select_one('div#resultStats').text.split() if s.isdigit()][0] / 10)
            max_pages = max_pages + 1
        except Exception:
            # Result count rendered with commas, e.g. "About 1,080 results (0.23 seconds)"
            max_pages = round(int(''.join(i for i in soup.select_one('div#resultStats').text if i.isdigit())) / 10)
            max_pages = max_pages + 1

        url_list = []

        options = Options()
        options.headless = True
        browser = webdriver.Chrome(options=options)
        browser.get(url)

        index = 0

        while True:
            try:
                index += 1
                page = browser.page_source
                soup = BeautifulSoup(page, 'lxml')
                linky = [result.a['href'] for result in soup.select('.r')]
                url_list.extend(linky)
                if index == max_pages:
                    break
                browser.find_element_by_xpath('//*[@id="pnnext"]/span[2]').click()
                time.sleep(2)
                sys.stdout.write('\r' + str(index) + ' : ' + str(max_pages) + '\r')
                sys.stdout.flush()
            except Exception:
                pass

        browser.quit()

        # deduplicate while preserving order
        self.urls = list(dict.fromkeys(url_list))
        sys.stdout.write('\r' + 'Total google search result urls extracted from the above keyword : ' + str(len(self.urls)) + '\r')
        sys.stdout.flush()


class newspaper:

    def __init__(self, url):
        self.url = url
        article = Article(self.url, request_timeout=10)
        article.download()
        article.parse()
        article.nlp()
        soup = BeautifulSoup(article.html, 'lxml')

        def cleaning_text(text):
            # strip URLs, lowercase, and keep only simple word/punctuation tokens
            text = re.sub(r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*', ' ', text.lower())
            words = re.findall(r'[a-zA-Z0-9:.,]+', text)
            return ' '.join(words)

        def author(soup):
            # probe the first three ld+json blocks for an author name
            author = None
            i = 0
            while True:
                try:
                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
                    author = meta.get('author')['name']
                    if '' != author:
                        break
                except Exception:
                    pass
                i += 1
                if i == 3:
                    break
            return author

        def date(soup):
            # probe the first three ld+json blocks for a datePublished value
            date = None
            i = 0
            while True:
                try:
                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
                    date = meta.get('datePublished')
                    if '' != date:
                        break
                except Exception:
                    pass
                i += 1
                if i == 3:
                    break
            return date

        def publisher(soup):
            # probe the first three ld+json blocks for a publisher name
            publisher = None
            i = 0
            while True:
                try:
                    meta = json.loads(soup.select('script[type="application/ld+json"]')[i].text)
                    publisher = meta.get('publisher')['name']
                    if '' != publisher:
                        break
                except Exception:
                    pass
                i += 1
                if i == 3:
                    break
            return publisher

        """
        :returns: author name
        """
        try:
            self.author = author(soup)
        except Exception:
            self.author = None

        """
        :returns: published date
        """
        try:
            try:
                self.date = date(soup)
            except Exception:
                self.date = article.meta_data['article']['published_time']
        except Exception:
            self.date = None

        """
        :returns: article
        """
        try:
            self.article = cleaning_text(article.text)
        except Exception:
            self.article = None

        """
        :returns: headline
        """
        try:
            self.headline = cleaning_text(article.title)
        except Exception:
            self.headline = None

        """
        :returns: keywords
        """
        try:
            self.keywords = article.keywords
        except Exception:
            self.keywords = None

        """
        :returns: summary
        """
        try:
            self.summary = cleaning_text(article.summary)
        except Exception:
            self.summary = None

        """
        :returns: description
        """
        try:
            try:
                self.description = cleaning_text(article.meta_description)
            except Exception:
                self.description = cleaning_text(article.meta_data['description'])
        except Exception:
            self.description = None

        """
        :returns: publication
        """
        try:
            try:
                self.publication = publisher(soup)
            except Exception:
                self.publication = article.meta_data['og']['site_name']
        except Exception:
            self.publication = None

        """
        :returns: category
        """
        try:
            try:
                # first path segment of the article URL, spell-corrected via pattern's suggest()
                text = cleaning_text(article.url[len(article.source_url):]).split()[1]
                self.category = [item[0] for item in suggest(text)][0]
            except Exception:
                self.category = article.meta_data['article']['section']
        except Exception:
            self.category = None
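
One setup note on newspaper.__init__ above: article.nlp() is what populates keywords and summary, and in Newspaper3k that step relies on NLTK's punkt tokenizer data, which nothing in this commit downloads. A one-time download along these lines is likely needed before first use (a sketch, not part of the commit):

```python
import nltk

# one-time download of the sentence tokenizer that newspaper3k's Article.nlp() uses
nltk.download('punkt')
```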
news_fetch/utils.py
@@ -0,0 +1,13 @@
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
from selenium import webdriver
from pattern.en import suggest
from newspaper import Article
from bs4 import BeautifulSoup
import chromedriver_binary
from requests import get
import time
import nltk
import json
import sys
import re
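
Worth noting: the bare import chromedriver_binary is there for its side effect. On import, the chromedriver-binary package prepends its bundled chromedriver executable to PATH, which is why news.py can construct webdriver.Chrome() without an explicit executable_path. A minimal sketch of the pattern:

```python
import chromedriver_binary  # noqa: F401  (side effect: puts the bundled chromedriver on PATH)
from selenium import webdriver

driver = webdriver.Chrome()  # resolves chromedriver via the PATH entry added above
driver.quit()
```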
requirements.txt
@@ -0,0 +1,8 @@
beautifulsoup4
selenium
chromedriver-binary==74.0.3729.6.0
pandas
pattern
fake_useragent
setuptools
twine
# Not pinned in this commit, but imported directly by news_fetch/utils.py:
newspaper3k
nltk
lxml
requests
setup.cfg
@@ -0,0 +1,2 @@
[metadata]
description-file = README.md
setup.py
@@ -0,0 +1,39 @@
"""A setuptools based setup module.

See:
https://packaging.python.org/guides/distributing-packages-using-setuptools/
https://santhoshse7en.github.io/news_fetch/
https://santhoshse7en.github.io/news_fetch_doc/
"""
# -*- encoding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function

# Always prefer setuptools over distutils
import setuptools

keywords = ['Newspaper', 'news_fetch', 'without-api', 'google_scraper', 'news_scraper', 'bs4', 'lxml']

setuptools.setup(
    name="news_fetch",
    version="0.0.4",
    author="M Santhosh Kumar",
    author_email="[email protected]",
    description="A Python Package which helps to scrape news details",
    long_description=open('README.md').read(),
    long_description_content_type="text/markdown",
    url="https://santhoshse7en.github.io/news_fetch/",
    keywords=keywords,
    # newspaper3k, nltk and lxml are imported by the package (see news_fetch/utils.py)
    # but were missing from install_requires in this commit
    install_requires=['beautifulsoup4', 'pandas', 'selenium', 'pattern', 'fake_useragent',
                      'chromedriver-binary==74.0.3729.6.0', 'newspaper3k', 'nltk', 'lxml'],
    packages=setuptools.find_packages(),
    classifiers=['Development Status :: 4 - Beta',
                 'Intended Audience :: End Users/Desktop',
                 'Intended Audience :: Developers',
                 'Intended Audience :: System Administrators',
                 'License :: OSI Approved :: MIT License',
                 'Operating System :: OS Independent',
                 'Programming Language :: Python',
                 'Topic :: Communications :: Email',
                 'Topic :: Office/Business',
                 'Topic :: Software Development :: Bug Tracking',
                 ],
)