Skip to content

Commit

Permalink
Remove pyppeteer from requirements and fix getArticleContent()
Browse files Browse the repository at this point in the history
  • Loading branch information
morganbarber committed Feb 27, 2024
1 parent e697d97 commit 7cc38f9
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 31 deletions.
42 changes: 16 additions & 26 deletions python_news_scraper/getArticleContent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from bs4 import BeautifulSoup
from pyppeteer import launch
import requests

verifyMessages = [
"you are human",
Expand All @@ -8,54 +8,44 @@
"recaptcha"
]

async def getArticleContent(articles, filterWords):
try:
browser = await launch()
processedArticlesPromises = [extractArticleContentAndFavicon(article, browser, filterWords) for article in articles]
processedArticles = await asyncio.gather(*processedArticlesPromises)
await browser.close()
return processedArticles
except Exception as err:
# print("getArticleContent ERROR:", err)
return articles
def getArticleContent(articles, filterWords):
    """Fetch and clean the body text of every article in *articles*.

    Each article dict is run through extractArticleContentAndFavicon,
    which returns it augmented with 'content' and 'favicon' keys.

    Args:
        articles: iterable of article dicts (each with a 'link' key).
        filterWords: words forwarded to the text-cleaning step.

    Returns:
        A list of processed article dicts, in input order.
    """
    return [
        extractArticleContentAndFavicon(article, filterWords)
        for article in articles
    ]

async def extractArticleContentAndFavicon(article, browser, filterWords):
def extractArticleContentAndFavicon(article, filterWords):
    """Download one article page and attach its cleaned text and favicon URL.

    Args:
        article: dict with at least a 'link' key holding the article URL.
        filterWords: words forwarded to cleanText() for removal.

    Returns:
        A copy of *article* with two keys added:
        'content' — the cleaned page text, or '' when the page is empty,
        hidden behind a human-verification wall, or shorter than ~100 words;
        'favicon' — the favicon href found in the page, or ''.
        On any error the original dict is returned with both keys set to ''.
    """
    try:
        # Bug fix: `response` was referenced without ever being assigned
        # (NameError on every call after the pyppeteer removal), so each
        # article silently fell into the except branch with empty content.
        # The timeout keeps one dead host from hanging the whole batch.
        response = requests.get(article['link'], timeout=30)

        content = response.text

        favicon = extractFavicon(content)

        soup = BeautifulSoup(content, 'html.parser')
        articleContent = soup.get_text(separator='\n')

        if not articleContent:
            # Nothing parseable on the page.
            return { **article, 'content': '', 'favicon': favicon }

        # Pages gated by a CAPTCHA / human-verification wall contain one of
        # the phrases in verifyMessages; their text is boilerplate, not news.
        hasVerifyMessage = any(w in articleContent.lower() for w in verifyMessages)
        if hasVerifyMessage:
            return { **article, 'content': '', 'favicon': favicon }

        cleanedText = cleanText(articleContent, filterWords)

        if len(cleanedText.split(' ')) < 100:  # Example threshold: 100 words
            # Too short to be a real article body (nav/boilerplate only).
            return { **article, 'content': '', 'favicon': favicon }

        return { **article, 'content': cleanedText, 'favicon': favicon }
    except Exception as error:
        # Best-effort scraper: one bad URL must never break the batch.
        return { **article, 'content': '', 'favicon': '' }

def extractFavicon(content):
    """Return the favicon URL declared in the given HTML, or ''.

    Looks for a <link rel="icon"> or <link rel="shortcut icon"> tag and
    returns its href attribute.

    Args:
        content: raw HTML of the page as a string.

    Returns:
        The favicon href, or '' when no matching tag (or no href) exists.
    """
    soup = BeautifulSoup(content, 'html.parser')
    link = soup.find('link', rel=['icon', 'shortcut icon'])
    # Bug fix: link['href'] raised KeyError on a malformed <link> tag that
    # has a matching rel but no href attribute; .get() degrades to ''.
    return link.get('href', '') if link else ''

def cleanText(text, filterWords):
unwantedKeywords = [
"subscribe now",
Expand Down
3 changes: 2 additions & 1 deletion python_news_scraper/python_news_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .buildQueryString import buildQueryString
from .getArticleContent import getArticleContent
from .getPrettyUrl import get_pretty_url
import asyncio

def googleNewsScraper(userConfig):
config = {
Expand Down Expand Up @@ -71,6 +72,6 @@ def googleNewsScraper(userConfig):

if config["getArticleContent"]:
filterWords = config.get("filterWords", [])
results = getArticleContent(results, browser, filterWords)
results = asyncio.run(getArticleContent(results, filterWords))

return results
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
beautifulsoup4
pyppeteer
beautifulsoup4
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@
author_email='[email protected]',
description='A python package to scrape news.',
packages=find_packages(),
install_requires=['requests', 'beautifulsoup4', 'pyppeteer'],
install_requires=['requests', 'beautifulsoup4'],
)
3 changes: 2 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
config = {
"queryVars": "hl=en-US",
"searchTerm": "python",
"getArticleContent": True,
}

results = python_news_scraper.googleNewsScraper(config)
print(results[0])
print(results)

0 comments on commit 7cc38f9

Please sign in to comment.