From 699bf4a75469ecc986c140ca68f89b247c2ce9dc Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Thu, 15 Feb 2024 21:37:57 +0100 Subject: [PATCH 1/5] ignore zdf entries --- .../ard/json/ArdTopicsLetterDeserializer.java | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java b/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java index 161f66cbf..12870c8cc 100644 --- a/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java +++ b/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java @@ -10,6 +10,7 @@ import mServer.crawler.sender.base.JsonUtils; import java.lang.reflect.Type; +import java.util.Arrays; import java.util.HashSet; import java.util.Optional; import java.util.Set; @@ -23,9 +24,13 @@ public class ArdTopicsLetterDeserializer implements JsonDeserializer parseTeaser(final JsonObject teaserObject) { id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID); } - id.ifPresent( - nonNullId -> - results.add( - new CrawlerUrlDTO( - String.format( - ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE)))); + if (isRelevant(teaserObject)) { + id.ifPresent( + nonNullId -> + results.add( + new CrawlerUrlDTO( + String.format( + ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE)))); + } return results; } + + private boolean isRelevant(final JsonObject teaserObject) { + if (teaserObject.has(ELEMENT_PUBLICATION_SERVICE)) { + final JsonObject publicationService = + teaserObject.get(ELEMENT_PUBLICATION_SERVICE).getAsJsonObject(); + final Optional attributeAsString = + JsonUtils.getAttributeAsString(publicationService, ATTRIBUTE_NAME); + if (attributeAsString.isPresent()) { + + return !Arrays.stream(IGNORED_SENDER) + .anyMatch(sender -> sender.equalsIgnoreCase(attributeAsString.get())); + } + } + + return true; + } } From e2665b11e7e8c0acdcafa42fad6ffb5096b7b46e Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Mon, 19 Feb 2024 22:04:30 +0100 Subject: [PATCH 2/5] filter ard topics --- .../parser/ZdfTopicsPageHtmlDeserializer.java | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java index 3bc36186a..a44e0bb36 100644 --- a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java +++ b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java @@ -4,6 +4,7 @@ import mServer.crawler.sender.base.UrlUtils; import mServer.crawler.sender.zdf.ZdfConstants; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.util.HashSet; @@ -11,20 +12,33 @@ public class ZdfTopicsPageHtmlDeserializer { - private static final String LINK_SELECTOR = "article h3 a"; + private static final String ARTICLE_SELECTOR = "article"; + private static final String LINK_SELECTOR = "h3 a"; + private static final String TEASER_SELECTOR = "dd.teaser-info span"; private static final String ATTRIBUTE_HREF = "href"; public Set deserialize(final Document document) { final Set results = new HashSet<>(); - Elements filmUrls = document.select(LINK_SELECTOR); + Elements filmUrls = document.select(ARTICLE_SELECTOR); filmUrls.forEach( - filmUrlElement -> { - String url = filmUrlElement.attr(ATTRIBUTE_HREF); - url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); - results.add(new CrawlerUrlDTO(url)); + articleElement -> { + final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR); + final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR); + if (filmUrlElement != null && isRelevant(teaserElement)) { + String url = filmUrlElement.attr(ATTRIBUTE_HREF); + url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); + results.add(new CrawlerUrlDTO(url)); + } }); return results; } + + private boolean isRelevant(Element teaserElement) { + if (teaserElement == null) { + return true; + } + return !("ARD".equalsIgnoreCase(teaserElement.text())); + } } From cd3346726c61666fbf6769a333dfe05ddf7bf66a Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 3 Mar 2024 23:05:11 +0100 Subject: [PATCH 3/5] reduce daysPast --- .../java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java index b73a5be97..96137476e 100644 --- a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java +++ b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java @@ -97,7 +97,7 @@ protected Collection getExtraDaysEntries() private ConcurrentLinkedQueue getDayUrls() { - int daysPast = CrawlerTool.loadLongMax() ? 60 : 20; + int daysPast = CrawlerTool.loadLongMax() ? 30 : 20; int daysFuture = CrawlerTool.loadLongMax() ? 30 : 10; final ConcurrentLinkedQueue urls = new ConcurrentLinkedQueue<>(); From 179152c2611718f309427fd19b531d32a1290427 Mon Sep 17 00:00:00 2001 From: lookshe Date: Mon, 4 Mar 2024 09:45:43 +0100 Subject: [PATCH 4/5] fix #967 (#968) --- src/main/java/mServer/Main.java | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/main/java/mServer/Main.java b/src/main/java/mServer/Main.java index f5cf3c242..82e6920c6 100644 --- a/src/main/java/mServer/Main.java +++ b/src/main/java/mServer/Main.java @@ -29,6 +29,9 @@ import mServer.tool.MserverDatumZeit; import mServer.tool.MserverLog; +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; + public class Main { public Main() { @@ -88,11 +91,15 @@ public static void main(String[] args) { } private static void runServer(String[] ar) throws InterruptedException { + LocalDateTime beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS); while (new MServer(ar).starten()) { - long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02 - MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)"); - Thread.sleep(timeToSleep); - MserverLog.systemMeldung("Neustart der Suche"); + if (!LocalDateTime.now().truncatedTo(ChronoUnit.DAYS).isAfter(beforeRun)) { // do not sleep if day changed + long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02 + MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)"); + Thread.sleep(timeToSleep); + MserverLog.systemMeldung("Neustart der Suche"); + } + beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS); } } From 851e9d84da44d6e523d2ad88507d53593eb62293 Mon Sep 17 00:00:00 2001 From: Alexander F Date: Mon, 4 Mar 2024 09:47:36 +0100 Subject: [PATCH 5/5] Version auf 3.1.229 angehoben --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 438aae770..6c64103e0 100644 --- a/build.gradle +++ b/build.gradle @@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17 targetCompatibility = JavaVersion.VERSION_17 group = 'de.mediathekview' archivesBaseName = "MServer" -version = '3.1.228' +version = '3.1.229' def jarName = 'MServer.jar' def mainClass = 'mServer.Main'