From 7cc38f94fb7d74a4a856bb1940c79b700d4b3548 Mon Sep 17 00:00:00 2001
From: Morgan Barber <109038897+morganbarber@users.noreply.github.com>
Date: Tue, 27 Feb 2024 15:38:27 +0000
Subject: [PATCH] Remove pyppeteer from requirements and fix getArticleContent()
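
Fetch article HTML with requests instead of driving a headless browser
through pyppeteer: getArticleContent() is now a plain synchronous
function, the favicon is parsed out of the fetched page with
BeautifulSoup, and callers no longer need an event loop. A minimal
usage sketch (mirroring test.py; article dicts gain a 'content' key
only when "getArticleContent" is enabled):

    import python_news_scraper

    results = python_news_scraper.googleNewsScraper({
        "searchTerm": "python",
        "getArticleContent": True,
    })
    print(results[0]["content"])
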
---
 python_news_scraper/getArticleContent.py   | 42 ++++++----------
 python_news_scraper/python_news_scraper.py |  2 +-
 requirements.txt                           |  4 ++--
 setup.py                                   |  2 +-
 test.py                                    |  3 ++-
 5 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/python_news_scraper/getArticleContent.py b/python_news_scraper/getArticleContent.py
index 59fdc3b..3f939b8 100644
--- a/python_news_scraper/getArticleContent.py
+++ b/python_news_scraper/getArticleContent.py
@@ -1,5 +1,5 @@
 from bs4 import BeautifulSoup
-from pyppeteer import launch
+import requests
 
 verifyMessages = [
     "you are human",
@@ -8,54 +8,44 @@
     "recaptcha"
 ]
 
-async def getArticleContent(articles, filterWords):
-    try:
-        browser = await launch()
-        processedArticlesPromises = [extractArticleContentAndFavicon(article, browser, filterWords) for article in articles]
-        processedArticles = await asyncio.gather(*processedArticlesPromises)
-        await browser.close()
-        return processedArticles
-    except Exception as err:
-        # print("getArticleContent ERROR:", err)
-        return articles
+def getArticleContent(articles, filterWords):
+    processedArticles = []
+    for article in articles:
+        processedArticle = extractArticleContentAndFavicon(article, filterWords)
+        processedArticles.append(processedArticle)
+    return processedArticles
 
-async def extractArticleContentAndFavicon(article, browser, filterWords):
+def extractArticleContentAndFavicon(article, filterWords):
     try:
-        page = await browser.newPage()
-        await page.goto(article['link'], waitUntil='networkidle2')
-        content = await page.content()
+        response = requests.get(article['link'])
+        content = response.text
 
-        favicon = await page.evaluate('''
-            () => {
-                const link = document.querySelector('link[rel="icon"], link[rel="shortcut icon"]');
-                return link ? link.getAttribute('href') : '';
-            }
-        ''')
+        favicon = extractFavicon(content)
 
         soup = BeautifulSoup(content, 'html.parser')
         articleContent = soup.get_text(separator='\n')
 
         if not articleContent:
-            # print("Article content could not be parsed or is empty.")
             return { **article, 'content': '', 'favicon': favicon }
 
         hasVerifyMessage = any(w in articleContent.lower() for w in verifyMessages)
         if hasVerifyMessage:
-            # print("Article requires human verification.")
             return { **article, 'content': '', 'favicon': favicon }
 
         cleanedText = cleanText(articleContent, filterWords)
 
         if len(cleanedText.split(' ')) < 100:  # Example threshold: 100 words
-            # print("Article content is too short and likely not valuable.")
             return { **article, 'content': '', 'favicon': favicon }
 
-        # print("SUCCESSFULLY SCRAPED ARTICLE CONTENT:", cleanedText)
         return { **article, 'content': cleanedText, 'favicon': favicon }
 
     except Exception as error:
-        # print('Error extracting article with Puppeteer:', error)
         return { **article, 'content': '', 'favicon': '' }
 
+def extractFavicon(content):
+    soup = BeautifulSoup(content, 'html.parser')
+    link = soup.find('link', rel=['icon', 'shortcut icon'])
+    return link.get('href', '') if link else ''  # tolerate icon links without an href
+
 def cleanText(text, filterWords):
     unwantedKeywords = [
         "subscribe now",
diff --git a/python_news_scraper/python_news_scraper.py b/python_news_scraper/python_news_scraper.py
index a1ebcda..dc63947 100644
--- a/python_news_scraper/python_news_scraper.py
+++ b/python_news_scraper/python_news_scraper.py
@@ -71,6 +71,6 @@
 
     if config["getArticleContent"]:
         filterWords = config.get("filterWords", [])
-        results = getArticleContent(results, browser, filterWords)
+        results = getArticleContent(results, filterWords)
 
     return results
diff --git a/requirements.txt b/requirements.txt
index 452e50b..041f722 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-beautifulsoup4
-pyppeteer
\ No newline at end of file
+beautifulsoup4
+requests
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 5e6f73b..e3c9fdb 100644
--- a/setup.py
+++ b/setup.py
@@ -8,5 +8,5 @@
     author_email='morganbarber928@gmail.com',
     description='A python package to scrape news.',
     packages=find_packages(),
-    install_requires=['requests', 'beautifulsoup4', 'pyppeteer'],
+    install_requires=['requests', 'beautifulsoup4'],
 )
\ No newline at end of file
diff --git a/test.py b/test.py
index 418810d..81dce73 100644
--- a/test.py
+++ b/test.py
@@ -3,7 +3,8 @@
 config = {
     "queryVars": "hl=en-US",
     "searchTerm": "python",
+    "getArticleContent": True,
 }
 
 results = python_news_scraper.googleNewsScraper(config)
-print(results[0])
\ No newline at end of file
+print(results)