simplify and improve sitemap init #503

Merged (4 commits, Feb 8, 2024)
tests/sitemaps_tests.py: 59 changes (34 additions, 25 deletions)

@@ -8,8 +8,9 @@

 from courlan import get_hostinfo

+import trafilatura
 from trafilatura import sitemaps
-from trafilatura.utils import decode_response, is_similar_domain
+from trafilatura.utils import decode_file, is_similar_domain

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -29,21 +30,28 @@ def test_extraction():
     '''Test simple link extraction'''
     # link handling
     url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls
+
+    # same URL
+    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
+    sitemap.current_url = url
+    sitemap.handle_link(url)
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls

-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain')
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls

-    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', 'https://example.org/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://example.org', 'example.org', ['https://example.org/sitemap.xml'])
     sitemap.handle_link('https://mydomain.wordpress.com/1')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['https://mydomain.wordpress.com/1']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['https://mydomain.wordpress.com/1']

-    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', 'https://programtalk.com/sitemap.xml')
+    sitemap = sitemaps.SitemapObject('https://programtalk.com', 'programtalk.com', ['https://programtalk.com/sitemap.xml'])
     sitemap.handle_link('http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent')
-    assert not sitemap.sitemap_urls and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']
+    assert len(sitemap.sitemap_urls) == 1 and sitemap.urls == ['http://programtalk.com/java-api-usage-examples/org.apache.xml.security.stax.securityEvent.SecurityEvent']

     # similar domain names
     assert not is_similar_domain('kleins-weindepot.de', 'eurosoft.net')
@@ -54,17 +62,17 @@ def test_extraction():
     url = 'https://de.sitemaps.org/1'
     sitemap_url = 'https://de.sitemaps.org/sitemap.xml'
     domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.handle_link(url)
     assert not sitemap.sitemap_urls and sitemap.urls == [url]

     # diverging domains
     url = 'https://www.software.info/1'
-    sitemap_url = 'https://example.org/sitemap.xml'
-    domain, baseurl = get_hostinfo(sitemap_url)
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap_urls = ['https://example.org/sitemap.xml']
+    domain, baseurl = get_hostinfo(sitemap_urls[0])
+    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_urls)
     sitemap.handle_link(url)
-    assert not sitemap.sitemap_urls and not sitemap.urls
+    assert len(sitemap.sitemap_urls) == 1 and not sitemap.urls

     # don't take this one?
     #url = 'https://subdomain.sitemaps.org/1'
@@ -78,7 +86,7 @@ def test_extraction():
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap.xml', '<!DOCTYPE html><html><body/></html>') is False
     assert sitemaps.is_plausible_sitemap('http://test.org/sitemap', '<!DOCTYPE html><html><body/></html>') is False
     # invalid
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = '<html>\n</html>'
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and not sitemap.urls
@@ -89,7 +97,7 @@ def test_extraction():
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
     assert sitemaps.is_plausible_sitemap('http://sitemaps.org/sitemap.xml', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, sitemap_url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
     assert not sitemap.sitemap_urls and len(sitemap.urls) == 84
@@ -103,20 +111,20 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap2.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls
+    assert sitemap.sitemap_urls == ['http://www.example.com/sitemap.xml', 'http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and not sitemap.urls

     # hreflang
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>http://www.test.org/english/page.html</loc></url></urlset>'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['http://www.test.org/english/page.html'])
     filepath = os.path.join(RESOURCES_DIR, 'sitemap-hreflang.xml')
     with open(filepath, "r", encoding="utf-8") as f:
         teststring = f.read()
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url, 'de')
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [], 'de')
     sitemap.content = teststring
     sitemap.extract_sitemap_langlinks()
     assert sitemap.sitemap_urls == ['http://www.example.com/sitemap-de.xml.gz']
@@ -127,26 +135,26 @@ def test_extraction():
     filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml.gz')
     with open(filepath, 'rb') as f:
         teststring = f.read()
-    teststring = decode_response(teststring)
+    teststring = decode_file(teststring)
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz', teststring) is True
-    sitemap = sitemaps.SitemapObject(baseurl, domain, url)
+    sitemap = sitemaps.SitemapObject(baseurl, domain, [url])
     sitemap.content = teststring
     sitemap.extract_sitemap_links()
-    assert len(sitemap.sitemap_urls) == 0 and len(sitemap.urls) == 84
+    assert len(sitemap.sitemap_urls) == 1 and len(sitemap.urls) == 84

     # check contents
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap.xml.gz?value=1', teststring) is True

     # TXT links
     content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     assert sitemaps.is_plausible_sitemap('http://example.org/sitemap', content) is True
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [])
     sitemap.content = 'Tralala\nhttps://test.org/1\nhttps://test.org/2'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/1', 'https://test.org/2'])

     # TXT links + language
-    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', 'https://test.org/sitemap', 'en')
+    sitemap = sitemaps.SitemapObject('https://test.org/', 'test.org', [], 'en')
     sitemap.content = 'Tralala\nhttps://test.org/en/1\nhttps://test.org/en/2\nhttps://test.org/es/3'
     sitemap.process()
     assert (sitemap.sitemap_urls, sitemap.urls) == ([], ['https://test.org/en/1', 'https://test.org/en/2'])
@@ -164,6 +172,7 @@ def test_robotstxt():

 def test_whole():
     "Test whole process."
+    trafilatura.settings.MAX_SITEMAPS_SEEN = 1
     results = sitemaps.sitemap_search("https://www.sitemaps.org", target_lang="de")
     assert len(results) == 8

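Taken together, the updated tests outline the new SitemapObject contract: the constructor receives a list of pending sitemap URLs instead of a single sitemap_url string, handle_link() files each discovered link under either sitemap_urls (nested sitemaps) or urls (regular pages), and current_url marks the sitemap currently being processed. A minimal sketch distilled from the assertions above (protocol.html is an illustrative URL, not taken from the test suite; the commented results follow from the tests):

    from trafilatura import sitemaps

    # start with no pending sitemaps and let handle_link() classify links
    sitemap = sitemaps.SitemapObject("https://www.sitemaps.org", "sitemaps.org", [])
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")    # XML, queued as a sitemap
    sitemap.handle_link("https://www.sitemaps.org/protocol.html")  # page, collected as a result
    print(sitemap.sitemap_urls)  # ['https://www.sitemaps.org/sitemap.xml']
    print(sitemap.urls)          # ['https://www.sitemaps.org/protocol.html']

    # "same URL" safety check: a link pointing to the sitemap currently
    # being processed is ignored rather than queued again
    sitemap.current_url = "https://www.sitemaps.org/sitemap.xml"
    sitemap.handle_link("https://www.sitemaps.org/sitemap.xml")
    print(len(sitemap.sitemap_urls))  # still 1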
trafilatura/sitemaps.py: 84 changes (37 additions, 47 deletions)

@@ -9,12 +9,11 @@
 import logging
 import re
 from itertools import islice
-from typing import List, Optional
+from typing import List, Set, Optional

 from courlan import (
     clean_url,
     extract_domain,
-    filter_urls,
     fix_relative_urls,
     get_hostinfo,
     lang_filter,

@@ -43,17 +42,24 @@
 SCRUB_REGEX = re.compile(r"\?.*$|#.*$")
 POTENTIAL_SITEMAP = re.compile(r"\.xml\b")  # |\bsitemap\b

-GUESSES = ["sitemap.xml.gz", "sitemap", "sitemap_index.xml", "sitemap_news.xml"]
+GUESSES = [
+    "sitemap.xml",
+    "sitemap.xml.gz",
+    "sitemap",
+    "sitemap_index.xml",
+    "sitemap_news.xml",
+]


 class SitemapObject:
     "Store all necessary information on sitemap download and processing."
     __slots__ = [
         "base_url",
         "content",
+        "current_url",
         "domain",
         "external",
-        "sitemap_url",
+        "seen",
         "sitemap_urls",
         "target_lang",
         "urls",

@@ -63,28 +69,30 @@ def __init__(
         self,
         base_url: str,
         domain: str,
-        sitemap_url: str,
+        sitemapsurls: List[str],
         target_lang: Optional[str] = None,
         external: bool = False,
     ) -> None:
         self.base_url: str = base_url
         self.content: str = ""
         self.domain: str = domain
         self.external: bool = external
-        self.sitemap_url: str = sitemap_url
-        self.sitemap_urls: List[str] = []
+        self.current_url: str = ""
+        self.seen: Set[str] = set()
+        self.sitemap_urls: List[str] = sitemapsurls
         self.target_lang: Optional[str] = target_lang
         self.urls: List[str] = []

     def fetch(self) -> None:
         "Fetch a sitemap over the network."
-        LOGGER.debug("fetching sitemap: %s", self.sitemap_url)
-        self.content = fetch_url(self.sitemap_url)
+        LOGGER.debug("fetching sitemap: %s", self.current_url)
+        self.content = fetch_url(self.current_url)
+        self.seen.add(self.current_url)

     def handle_link(self, link: str) -> None:
         """Examine a link and determine if it's valid and if it leads to
         a sitemap or a web page."""
-        if link == self.sitemap_url:  # safety check
+        if link == self.current_url:  # safety check
             return
         # fix, check, clean and normalize
         link = fix_relative_urls(self.base_url, link)

@@ -135,7 +143,7 @@ def extract_sitemap_langlinks(self) -> None:
"%s sitemaps and %s links with hreflang found for %s",
len(self.sitemap_urls),
len(self.urls),
self.sitemap_url,
self.current_url,
)

def extract_sitemap_links(self) -> None:
Expand All @@ -150,12 +158,12 @@ def extract_sitemap_links(self) -> None:
"%s sitemaps and %s links found for %s",
len(self.sitemap_urls),
len(self.urls),
self.sitemap_url,
self.current_url,
)

def process(self) -> None:
"Download a sitemap and extract the links it contains."
plausible = is_plausible_sitemap(self.sitemap_url, self.content)
plausible = is_plausible_sitemap(self.current_url, self.content)
# safeguard
if not plausible:
return
Expand Down Expand Up @@ -200,47 +208,29 @@ def sitemap_search(
         LOGGER.warning("base URL unreachable, dropping sitemap: %s", url)
         return []

-    urlfilter = None
     if url.endswith((".gz", "sitemap", ".xml")):
-        sitemapurl = url
+        sitemapurls = [url]
     else:
-        sitemapurl = baseurl + "/sitemap.xml"
-        # filter triggered, prepare it
-        if len(url) > len(baseurl) + 2:
-            urlfilter = url
-
-    sitemap = SitemapObject(baseurl, domainname, sitemapurl, target_lang, external)
-    sitemap.fetch()
-    sitemap.process()
-
-    if not sitemap.sitemap_urls and sitemap.urls:
-        linklist = filter_urls(sitemap.urls, urlfilter)
-        LOGGER.debug("%s sitemap links found for %s", len(linklist), domainname)
-        return linklist
-
-    # try sitemaps in robots.txt file if nothing has been found
-    if not sitemap.sitemap_urls and not sitemap.urls:
-        sitemap.sitemap_urls = find_robots_sitemaps(baseurl)
-        # try additional URLs just in case
-        if not sitemap.sitemap_urls:
-            sitemap.sitemap_urls = ["".join([baseurl, "/", g]) for g in GUESSES]
+        sitemapurls = []
+
+    sitemap = SitemapObject(baseurl, domainname, sitemapurls, target_lang, external)
+
+    # try sitemaps in robots.txt file, additional URLs just in case
+    if not sitemap.sitemap_urls:
+        sitemap.sitemap_urls = find_robots_sitemaps(baseurl) or [
+            f"{baseurl}/{g}" for g in GUESSES
+        ]

     # iterate through nested sitemaps and results
-    seen = {sitemapurl}
-    i = 1
-    while sitemap.sitemap_urls:
-        sitemap.sitemap_url = sitemap.sitemap_urls.pop()
+    while sitemap.sitemap_urls and len(sitemap.seen) < MAX_SITEMAPS_SEEN:
+        sitemap.current_url = sitemap.sitemap_urls.pop()
         sitemap.fetch()
         sitemap.process()
-        # sanity check: keep track of visited sitemaps and exclude them
-        seen.add(sitemap.sitemap_url)
-        sitemap.sitemap_urls = [s for s in sitemap.sitemap_urls if s not in seen]
-        # counter and safeguard
-        i += 1
-        if i > MAX_SITEMAPS_SEEN:
-            break
-
-    sitemap.urls = filter_urls(sitemap.urls, urlfilter)
+        sitemap.sitemap_urls = [
+            s for s in sitemap.sitemap_urls if s not in sitemap.seen
+        ]

     LOGGER.debug("%s sitemap links found for %s", len(sitemap.urls), domainname)
     return sitemap.urls
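The rewritten sitemap_search() boils down to a bounded worklist traversal: pending sitemap URLs are popped one by one, fetch() records each visited URL in sitemap.seen, and entries already seen are pruned from the queue, so at most MAX_SITEMAPS_SEEN sitemaps are downloaded per domain. A self-contained sketch of the same pattern, using generic names rather than trafilatura's API:

    from typing import Callable, List, Set

    def bounded_worklist(
        pending: List[str],
        fetch_and_parse: Callable[[str], List[str]],  # returns nested sitemap URLs
        max_seen: int = 10,  # plays the role of MAX_SITEMAPS_SEEN
    ) -> Set[str]:
        "Visit a queue of URLs, deduplicating and stopping after max_seen fetches."
        seen: Set[str] = set()
        while pending and len(seen) < max_seen:
            current = pending.pop()
            seen.add(current)  # mirrors fetch() adding current_url to self.seen
            # queue newly discovered sitemaps unless they were already visited
            pending.extend(u for u in fetch_and_parse(current) if u not in seen)
        return seen

Call-site usage is unchanged, as exercised by test_whole above: sitemaps.sitemap_search("https://www.sitemaps.org", target_lang="de").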
