-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1cfc733
commit cb98d9a
Showing
2 changed files
with
94 additions
and
11 deletions.
There are no files selected for viewing
35 changes: 24 additions & 11 deletions
35
src/main/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,31 +1,44 @@ | ||
package de.mediathekview.mserver.crawler.zdf.parser; | ||
|
||
import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF; | ||
|
||
import de.mediathekview.mserver.base.utils.UrlUtils; | ||
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; | ||
import de.mediathekview.mserver.crawler.zdf.ZdfConstants; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.select.Elements; | ||
|
||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
public class ZdfTopicsPageHtmlDeserializer { | ||
|
||
private static final String LINK_SELECTOR = "article h3 a"; | ||
private static final String ARTICLE_SELECTOR = "article"; | ||
private static final String LINK_SELECTOR = "h3 a"; | ||
private static final String TEASER_SELECTOR = "dd.teaser-info span"; | ||
|
||
// Collects URLs found in the given document into a set of CrawlerUrlDTO and returns that set. | ||
// NOTE(review): this is a diff rendering, so two variants of the selection logic appear | ||
// one after the other below - presumably the replaced rows followed by their replacement. | ||
public Set<CrawlerUrlDTO> deserialize(final Document document) { | ||
final Set<CrawlerUrlDTO> results = new HashSet<>(); | ||
|
||
// Variant selecting with LINK_SELECTOR, followed by the variant selecting with ARTICLE_SELECTOR. | ||
Elements filmUrls = document.select(LINK_SELECTOR); | ||
Elements filmUrls = document.select(ARTICLE_SELECTOR); | ||
filmUrls.forEach( | ||
filmUrlElement -> { | ||
// Read the href attribute; URL_BASE is passed to addDomainIfMissing, | ||
// presumably to turn relative links into absolute ones - confirm in UrlUtils. | ||
String url = filmUrlElement.attr(ATTRIBUTE_HREF); | ||
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); | ||
results.add(new CrawlerUrlDTO(url)); | ||
articleElement -> { | ||
// Within each selected element, look up the first link match and the first teaser match. | ||
final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR); | ||
final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR); | ||
// Only add a URL when a link was found and isRelevant accepts the teaser element. | ||
if (filmUrlElement != null && isRelevant(teaserElement)) { | ||
String url = filmUrlElement.attr(ATTRIBUTE_HREF); | ||
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); | ||
results.add(new CrawlerUrlDTO(url)); | ||
} | ||
}); | ||
|
||
return results; | ||
} | ||
|
||
// Decides whether an entry should be kept, based on its teaser element. | ||
// Returns true when teaserElement is null or when its text is anything other than "ARD" | ||
// (compared case-insensitively); returns false when the text equals "ARD". | ||
private boolean isRelevant(Element teaserElement) { | ||
// No teaser element available: the entry is treated as relevant. | ||
if (teaserElement == null) { | ||
return true; | ||
} | ||
// Literal-first comparison that ignores letter case. | ||
return !("ARD".equalsIgnoreCase(teaserElement.text())); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters