-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add scraper for RSD aggregator
- Loading branch information
1 parent
a1c0fe6
commit e5591de
Showing
9 changed files
with
351 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
-- SPDX-FileCopyrightText: 2024 Dusan Mijatovic (Netherlands eScience Center) | ||
-- SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
-- SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
-- | ||
-- SPDX-License-Identifier: Apache-2.0 | ||
|
@@ -41,7 +42,7 @@ SELECT | |
software_overview.licenses | ||
FROM | ||
software_overview() | ||
UNION | ||
UNION ALL | ||
SELECT | ||
remote_software.id, | ||
remote_rsd.label AS source, | ||
|
@@ -61,7 +62,7 @@ SELECT | |
FROM | ||
remote_software | ||
INNER JOIN | ||
remote_rsd ON remote_rsd.id=remote_software.remote_rsd | ||
remote_rsd ON remote_rsd.id = remote_software.remote_rsd_id | ||
; | ||
$$; | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
161 changes: 161 additions & 0 deletions
161
scrapers/src/main/java/nl/esciencecenter/rsd/scraper/aggregator/MainAggregator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package nl.esciencecenter.rsd.scraper.aggregator; | ||
|
||
import com.google.gson.JsonArray; | ||
import com.google.gson.JsonElement; | ||
import com.google.gson.JsonObject; | ||
import nl.esciencecenter.rsd.scraper.Config; | ||
import nl.esciencecenter.rsd.scraper.RsdResponseException; | ||
import nl.esciencecenter.rsd.scraper.Utils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.IOException; | ||
import java.net.URI; | ||
import java.time.Duration; | ||
import java.time.ZonedDateTime; | ||
import java.time.format.DateTimeFormatter; | ||
import java.util.Collection; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.UUID; | ||
import java.util.concurrent.Callable; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.ConcurrentMap; | ||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.Executors; | ||
import java.util.concurrent.Future; | ||
|
||
public class MainAggregator { | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(MainAggregator.class); | ||
private static final URI BASE_URL = URI.create(Config.backendBaseUrl()); | ||
static final String REMOTE_SOFTWARE_TABLE_NAME = "remote_software"; | ||
static final String AGGREGATOR_SERVICE_NAME = "Aggregator"; | ||
|
||
|
||
public static void main(String[] args) { | ||
LOGGER.info("Start aggregating RSDs"); | ||
long start = System.nanoTime(); | ||
|
||
ZonedDateTime now = ZonedDateTime.now(); | ||
Collection<RemoteRsdData> allRemoteEntries = PostgrestConnector.allActiveDomains(BASE_URL); | ||
|
||
Collection<RemoteRsdData> remoteEntriesToScrape = allRemoteEntries.stream() | ||
.filter(entry -> entry.refreshedAt() == null || entry.refreshedAt() | ||
.plus(entry.refreshInterval()) | ||
// subtracting 10 seconds to take into account variations in when this scraper starts | ||
.minus(Duration.ofSeconds(10L)) | ||
.isBefore(now)) | ||
.toList(); | ||
|
||
ConcurrentMap<UUID, JsonArray> softwarePerId = new ConcurrentHashMap<>(remoteEntriesToScrape.size()); | ||
Collection<Callable<Void>> tasks = remoteEntriesToScrape.stream() | ||
.<Callable<Void>>map(entry -> () -> { | ||
JsonArray scrapedSoftware = RemoteRsdConnector.getAllSoftware(entry.domain()); | ||
softwarePerId.put(entry.id(), scrapedSoftware); | ||
return null; | ||
}) | ||
.toList(); | ||
|
||
try (ExecutorService executorService = Executors.newFixedThreadPool(8)) { | ||
List<Future<Void>> completedFutures = executorService.invokeAll(tasks); | ||
for (Future<Void> completedFuture : completedFutures) { | ||
try { | ||
completedFuture.get(); | ||
} catch (ExecutionException e) { | ||
LOGGER.error("Unknown error", e); | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, null, e); | ||
} catch (InterruptedException e) { | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, null, e); | ||
LOGGER.error("Got interrupted, early exiting aggregating RSDs", e); | ||
Thread.currentThread().interrupt(); | ||
return; | ||
} | ||
} | ||
} catch (InterruptedException e) { | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, null, e); | ||
LOGGER.error("Got interrupted, early exiting aggregating RSDs", e); | ||
Thread.currentThread().interrupt(); | ||
return; | ||
} | ||
|
||
JsonArray allSoftware = new JsonArray(); | ||
for (Map.Entry<UUID, JsonArray> entry : softwarePerId.entrySet()) { | ||
JsonArray softwareArray = entry.getValue(); | ||
UUID id = entry.getKey(); | ||
for (JsonElement jsonElement : softwareArray) { | ||
JsonObject jsonObject = jsonElement.getAsJsonObject(); | ||
jsonObject.addProperty("remote_rsd_id", id.toString()); | ||
jsonObject.addProperty("remote_software_id", jsonObject.getAsJsonPrimitive("id").getAsString()); | ||
jsonObject.remove("id"); | ||
jsonObject | ||
.addProperty("scraped_at", now.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); | ||
|
||
allSoftware.add(jsonElement); | ||
} | ||
} | ||
|
||
PostgrestConnector.saveRemoteSoftware(BASE_URL, allSoftware); | ||
|
||
for (UUID id : softwarePerId.keySet()) { | ||
PostgrestConnector.updateRefreshedTimeAndErrorMessage(BASE_URL, id, now, null); | ||
} | ||
|
||
Map<UUID, Collection<UUID>> softwareScrapedPerRemoteId = new HashMap<>(); | ||
for (JsonElement jsonElement : allSoftware) { | ||
JsonObject jsonObject = jsonElement.getAsJsonObject(); | ||
UUID remoteRsdId = UUID.fromString(jsonObject.getAsJsonPrimitive("remote_rsd_id").getAsString()); | ||
UUID remoteSoftwareId = UUID.fromString(jsonObject.getAsJsonPrimitive("remote_software_id").getAsString()); | ||
Collection<UUID> softwareOfRemote = softwareScrapedPerRemoteId.computeIfAbsent(remoteRsdId, id -> new HashSet<>()); | ||
softwareOfRemote.add(remoteSoftwareId); | ||
} | ||
|
||
for (RemoteRsdData remoteRsdData : remoteEntriesToScrape) { | ||
UUID remoteId = remoteRsdData.id(); | ||
if (!softwareScrapedPerRemoteId.containsKey(remoteId)) { | ||
continue; | ||
} | ||
|
||
Collection<UUID> scrapedSoftware = softwareScrapedPerRemoteId.get(remoteId); | ||
Collection<UUID> previouslyStoredSoftwareIds = remoteRsdData.softwareIds(); | ||
|
||
for (UUID previouslyStoredSoftwareId : previouslyStoredSoftwareIds) { | ||
if (!scrapedSoftware.contains(previouslyStoredSoftwareId)) { | ||
try { | ||
PostgrestConnector.deleteSoftware(BASE_URL, remoteId, previouslyStoredSoftwareId); | ||
} catch (RsdResponseException e) { | ||
LOGGER.error("Unknown error", e); | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, previouslyStoredSoftwareId, e); | ||
} catch (IOException e) { | ||
LOGGER.error("Unknown error", e); | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, previouslyStoredSoftwareId, e); | ||
} catch (InterruptedException e) { | ||
LOGGER.error("Got interrupted, early exiting deleting old entries", e); | ||
Utils.saveExceptionInDatabase(AGGREGATOR_SERVICE_NAME, REMOTE_SOFTWARE_TABLE_NAME, previouslyStoredSoftwareId, e); | ||
Thread.currentThread().interrupt(); | ||
return; | ||
} | ||
} | ||
} | ||
} | ||
|
||
for (RemoteRsdData entry : remoteEntriesToScrape) { | ||
UUID id = entry.id(); | ||
if (!softwarePerId.containsKey(id)) { | ||
PostgrestConnector.updateRefreshedTimeAndErrorMessage(BASE_URL, id, now, "Unknown error while scraping"); | ||
} | ||
} | ||
|
||
long stop = System.nanoTime(); | ||
LOGGER.info("Done aggregating RSDs ({} ms)", (stop - start) / 1000_000L); | ||
} | ||
|
||
} |
106 changes: 106 additions & 0 deletions
106
scrapers/src/main/java/nl/esciencecenter/rsd/scraper/aggregator/PostgrestConnector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package nl.esciencecenter.rsd.scraper.aggregator; | ||
|
||
import com.google.gson.JsonArray; | ||
import com.google.gson.JsonElement; | ||
import com.google.gson.JsonObject; | ||
import com.google.gson.JsonParser; | ||
import nl.esciencecenter.rsd.scraper.RsdResponseException; | ||
import nl.esciencecenter.rsd.scraper.Utils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.IOException; | ||
import java.net.URI; | ||
import java.time.Duration; | ||
import java.time.ZonedDateTime; | ||
import java.time.format.DateTimeFormatter; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.UUID; | ||
|
||
public class PostgrestConnector { | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(PostgrestConnector.class); | ||
|
||
private PostgrestConnector() { | ||
} | ||
|
||
public static Collection<RemoteRsdData> allActiveDomains(URI baseUrl) { | ||
String filter = "select=id,domain,scrape_interval_minutes,scraped_at,remote_software(remote_software_id)&active=is.true"; | ||
String url = baseUrl.toString() + "/remote_rsd?" + filter; | ||
|
||
String response = Utils.getAsAdmin(url); | ||
|
||
return parseDomainsToScrapeResponse(response); | ||
} | ||
|
||
static List<RemoteRsdData> parseDomainsToScrapeResponse(String response) { | ||
JsonArray jsonTree = JsonParser.parseString(response).getAsJsonArray(); | ||
List<RemoteRsdData> result = new ArrayList<>(jsonTree.size()); | ||
|
||
for (JsonElement element : jsonTree) { | ||
try { | ||
JsonObject jsonObject = element.getAsJsonObject(); | ||
UUID id = UUID.fromString(jsonObject.getAsJsonPrimitive("id").getAsString()); | ||
URI domain = URI.create(jsonObject.getAsJsonPrimitive("domain").getAsString()); | ||
Duration refreshInterval = Duration.ofMinutes(jsonObject.getAsJsonPrimitive("scrape_interval_minutes") | ||
.getAsLong()); | ||
JsonElement refreshedAtElement = jsonObject.get("scraped_at"); | ||
ZonedDateTime refreshedAt = refreshedAtElement.isJsonNull() ? null : ZonedDateTime.parse(refreshedAtElement.getAsString()); | ||
|
||
Collection<UUID> softwareIds = new HashSet<>(); | ||
JsonArray idsArray = jsonObject.getAsJsonArray("remote_software"); | ||
for (JsonElement jsonElement : idsArray) { | ||
UUID softwareId = UUID.fromString(jsonElement.getAsJsonObject() | ||
.getAsJsonPrimitive("remote_software_id") | ||
.getAsString()); | ||
softwareIds.add(softwareId); | ||
} | ||
|
||
result.add(new RemoteRsdData( | ||
id, | ||
domain, | ||
refreshInterval, | ||
refreshedAt, | ||
softwareIds | ||
)); | ||
} catch (RuntimeException e) { | ||
LOGGER.error("Exception when parsing item", e); | ||
Utils.saveExceptionInDatabase(MainAggregator.AGGREGATOR_SERVICE_NAME, MainAggregator.REMOTE_SOFTWARE_TABLE_NAME, null, e); | ||
} | ||
} | ||
|
||
return result; | ||
} | ||
|
||
public static void deleteSoftware(URI baseUrl, UUID remoteRsdId, UUID remoteSoftwareId) throws RsdResponseException, IOException, InterruptedException { | ||
String filter = "remote_rsd_id=eq." + remoteRsdId + "&remote_software_id=eq." + remoteSoftwareId; | ||
String url = baseUrl.toString() + "/remote_software?" + filter; | ||
|
||
Utils.deleteAsAdmin(url); | ||
} | ||
|
||
public static void saveRemoteSoftware(URI baseUrl, JsonArray softwareArray) { | ||
String url = baseUrl + "/remote_software?on_conflict=remote_rsd_id,remote_software_id"; | ||
|
||
Utils.postAsAdmin(url, softwareArray.toString(), "Prefer", "resolution=merge-duplicates"); | ||
} | ||
|
||
public static void updateRefreshedTimeAndErrorMessage(URI baseUrl, UUID id, ZonedDateTime refreshedAt, String errorMessage) { | ||
String filter = "id=eq." + id; | ||
String url = baseUrl.toString() + "/remote_rsd?" + filter; | ||
|
||
JsonObject body = new JsonObject(); | ||
body.addProperty("scraped_at", refreshedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); | ||
body.addProperty("last_err_msg", errorMessage); | ||
|
||
Utils.patchAsAdmin(url, body.toString()); | ||
} | ||
} |
28 changes: 28 additions & 0 deletions
28
scrapers/src/main/java/nl/esciencecenter/rsd/scraper/aggregator/RemoteRsdConnector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package nl.esciencecenter.rsd.scraper.aggregator; | ||
|
||
import com.google.gson.JsonArray; | ||
import com.google.gson.JsonParser; | ||
import nl.esciencecenter.rsd.scraper.RsdResponseException; | ||
import nl.esciencecenter.rsd.scraper.Utils; | ||
|
||
import java.io.IOException; | ||
import java.net.URI; | ||
|
||
public class RemoteRsdConnector { | ||
|
||
private RemoteRsdConnector() { | ||
} | ||
|
||
public static JsonArray getAllSoftware(URI remoteDomain) throws RsdResponseException, IOException, InterruptedException { | ||
String path = "/api/v1/rpc/software_overview"; | ||
String url = remoteDomain.toString() + path; | ||
|
||
String response = Utils.get(url); | ||
return JsonParser.parseString(response).getAsJsonArray(); | ||
} | ||
} |
Oops, something went wrong.