Skip to content

Commit

Permalink
#965 filter ard topics
Browse files Browse the repository at this point in the history
  • Loading branch information
pidoubleyou committed Feb 19, 2024
1 parent 1cfc733 commit cb98d9a
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 11 deletions.
Original file line number Diff line number Diff line change
@@ -1,31 +1,44 @@
package de.mediathekview.mserver.crawler.zdf.parser;

import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF;

import de.mediathekview.mserver.base.utils.UrlUtils;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.crawler.zdf.ZdfConstants;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.util.HashSet;
import java.util.Set;

import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class ZdfTopicsPageHtmlDeserializer {

private static final String LINK_SELECTOR = "article h3 a";
private static final String ARTICLE_SELECTOR = "article";
private static final String LINK_SELECTOR = "h3 a";
private static final String TEASER_SELECTOR = "dd.teaser-info span";

public Set<CrawlerUrlDTO> deserialize(final Document document) {
final Set<CrawlerUrlDTO> results = new HashSet<>();

Elements filmUrls = document.select(LINK_SELECTOR);
Elements filmUrls = document.select(ARTICLE_SELECTOR);
filmUrls.forEach(
filmUrlElement -> {
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
results.add(new CrawlerUrlDTO(url));
articleElement -> {
final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR);
final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR);
if (filmUrlElement != null && isRelevant(teaserElement)) {
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
results.add(new CrawlerUrlDTO(url));
}
});

return results;
}

/**
 * Decides whether the article a teaser belongs to should be kept.
 *
 * @param teaserElement the teaser info element found inside an article, or {@code null} if the
 *     article has none
 * @return {@code true} if there is no teaser element or its text differs from "ARD" (compared
 *     case-insensitively); {@code false} if the teaser text is "ARD"
 */
private boolean isRelevant(Element teaserElement) {
  // A missing teaser cannot be identified as ARD content, so such articles are kept.
  final boolean isArdTeaser =
      teaserElement != null && "ARD".equalsIgnoreCase(teaserElement.text());
  return !isArdTeaser;
}
}
70 changes: 70 additions & 0 deletions src/test/resources/zdf/zdf_topics_page1.html
Original file line number Diff line number Diff line change
Expand Up @@ -900,8 +900,78 @@ <h3 id="Achtung,Essen!-1" class="teaser-title js-rb-live" data-module="reload-gu

</div>
</article>
<article
class="b-content-teaser-item b-content-teaser-item-new cell js-impression-track m-show" >


<div class="inner m-clickarea">
<div class="ratio-wrap">
<div class="b-ratiobox">
<picture class="artdirect"
>

<source class="m-8-9 abtest"
data-srcset="https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~240x270?cb=1699289021228 240w 270h,https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~640x720?cb=1699289021228 640w 720h"
srcset="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" media="(min-width: 1px)" />
<source class="m-16-9 abtest"
data-srcset="https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~384x216?cb=1699289021228 384w 216h,https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~768x432?cb=1699289021228 768w 432h,https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~1280x720?cb=1699289021228 1280w 720h,https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~1920x1080?cb=1699289021228 1920w 1080h"
srcset="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" media="(min-width: 1px)" />
<img class="preview-image ratiobox-item lazyload js-artdirect abtest" data-src="https://www.zdf.de/assets/collection-teaser-image-ard-all-you-need-100~1920x1080?cb=1699289021228" src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" data-sizes="auto" alt="All You Need">
</picture>
</div>
</div>
<div
class="box m-tags">

<h3 id="AllYouNeed-11" class="teaser-title js-rb-live"
data-module="reload-guided-tour">

<a href="/daserste/all-you-need" title="All You Need"
class="teaser-title-link m-clickarea-action js-track-click"
data-use-in-app-browser='false' data-target-id="collection-index-page-ard-collection-ard-dxjuomfyzdpzag93ojzln2u0ymywmzizmje2ngq-118"
data-target-profile="http://zdf.de/rels/content/page-index-pd11-teaser-target"
data-target-content-type="brand"
data-target-video-type=""

data-track='{
"element": "ContentTeaser",
"action": "Click",
"format": "Small",
"teaserTracking": "true",
"targetAssetId": "SCMS_index-page-ard-collection_ard_dxjuomfyzdpzag93ojzln2u0ymywmzizmje2ngq-",
"clickedClusterPosition": "{clusterPos(.b-content-teaser-list)}",
"clickedTeaserPosition": "{teaserPos(.b-content-teaser-list, .b-content-teaser-item)}",
"nodeId": "{ nodeId(.b-content-teaser-list) }",
"actionDetail": "Inline|VPos:{clusterPos(.b-content-teaser-list)}|HPos:{teaserPos(.b-content-teaser-list, .b-content-teaser-item)}|Cluster:|All_You_Need|Linkziel:all-you-need"
}'
>
<span class="normal-space">
All You Need<span class="hyphens-helper">&nbsp;</span>
</span>
</a>
</h3>










<dl class="teaser-foot js-rb-live"
data-module="news-infoline" data-news-infoline-text="ARD">
<span class="icon-502_play icon " aria-hidden="true"></span>
<dd class="teaser-info" aria-label=" ARD">
<span aria-hidden="true">ARD</span>
</dd>
</dl>
</div>

</div>
</article>




Expand Down

0 comments on commit cb98d9a

Please sign in to comment.