Skip to content

Commit

Permalink
Fix article content extraction and add response status check
Browse files: browse the repository at this point in the history
  • Loading branch information
morganbarber committed Feb 27, 2024
1 parent 7e7bd97 commit 3c40aca
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 15 deletions.
33 changes: 19 additions & 14 deletions python_news_scraper/getArticleContent.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,32 @@ def getArticleContent(articles, filterWords):

def extractArticleContentAndFavicon(article, filterWords, timeout=10):
    """Fetch one article page and return the article with text and favicon.

    The page at ``article['url']`` is downloaded; its favicon is obtained via
    ``extractFavicon`` and its visible text via BeautifulSoup, then passed
    through ``cleanText`` together with ``filterWords``.

    Args:
        article: Mapping describing the article; must contain a ``'url'`` key.
            It is not mutated -- a new dict is returned.
        filterWords: Forwarded unchanged to ``cleanText``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests.get`` may block indefinitely.

    Returns:
        A copy of ``article`` with two extra keys:

        * ``'content'`` -- the cleaned text, or ``''`` when the request
          failed, the page had no text, a ``verifyMessages`` entry was found
          in the text, or fewer than 100 space-separated words remained.
        * ``'favicon'`` -- the value returned by ``extractFavicon``, or ``''``
          when the page could not be fetched or any step raised.
    """
    try:
        response = requests.get(article['url'], timeout=timeout)

        if response.status_code != 200:
            print("Response fail")
            return {**article, 'content': '', 'favicon': ''}

        print("response success")
        content = response.text
        favicon = extractFavicon(content)

        # From here on the favicon is known, so every early exit keeps it.
        withoutContent = {**article, 'content': '', 'favicon': favicon}

        soup = BeautifulSoup(content, 'html.parser')
        articleContent = soup.get_text(separator='\n')
        if not articleContent:
            return withoutContent

        # Lowercase once instead of once per verifyMessages entry.
        # NOTE(review): verifyMessages presumably lists "verify you are human"
        # style phrases -- confirm against its definition.
        loweredContent = articleContent.lower()
        if any(w in loweredContent for w in verifyMessages):
            return withoutContent

        cleanedText = cleanText(articleContent, filterWords)

        # Example threshold: pages with fewer than 100 words are discarded.
        if len(cleanedText.split(' ')) < 100:
            return withoutContent

        return {**article, 'content': cleanedText, 'favicon': favicon}
    except Exception:
        # Deliberate best-effort: any network/parsing failure (including a
        # missing 'url' key) yields the article with empty content/favicon.
        return {**article, 'content': '', 'favicon': ''}

Expand Down
2 changes: 1 addition & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
}

results = python_news_scraper.googleNewsScraper(config)
# Show only the first scraped article. An empty result would make results[0]
# raise IndexError, so fall back to printing the (empty) result itself.
print(results[0] if results else results)

0 comments on commit 3c40aca

Please sign in to comment.