From 2165063077013eb4d894913697d84396269e463b Mon Sep 17 00:00:00 2001
From: Fufeck
Date: Mon, 11 Sep 2023 11:56:00 +0200
Subject: [PATCH] refacto get datasets beta.gouv

---
Review notes (text in this section is not part of the commit):
- Revised after review: the dataset listing URL is now built as
  `${URL_API_DATA_GOUV}/datasets/?tag=...`. The first version concatenated
  '/datasets/' with '/?tag=...' and therefore requested
  '.../datasets//?tag=...', unlike the single-slash URL this patch removes.
  The inline comments added by the patch were also reworded in English.
  Hunk sizes and the diffstat are unchanged; the commit id above and the
  blob ids on the `index` lines still identify the original revision.
- NOTE(review): in augmentCustomEntry, `...entry` is now spread last, so
  `entry.dataset` / `entry.organization` (ids, judging by how computeList and
  getOrganization use them) take precedence over the fetched `dataset` /
  `organization` objects, and `entry.type` over 'github'. Confirm this is
  intended.
- NOTE(review): isCertified is now applied to the organization embedded in the
  listing response instead of the result of getOrganization. Confirm that the
  embedded object always carries `badges`.

 .env.sample             |  1 +
 lib/sources/datagouv.js | 80 +++++++++++++++++++++++++----------------
 lib/sources/index.js    | 14 +++++---
 3 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/.env.sample b/.env.sample
index 04a9f10..1a41439 100644
--- a/.env.sample
+++ b/.env.sample
@@ -6,6 +6,7 @@ ADMIN_TOKEN=
 API_DEPOT_URL=https://plateforme-bal.adresse.data.gouv.fr/api-depot
 API_DEPOT_CLIENT_SECRET=
 API_DEPOT_CLIENT_ID=
+URL_API_DATA_GOUV=https://www.data.gouv.fr/api/1
 
 S3_ENDPOINT=https://s3.gra.io.cloud.ovh.net/
 S3_CONTAINER_ID=
diff --git a/lib/sources/datagouv.js b/lib/sources/datagouv.js
index c6ba9a0..cdbf640 100644
--- a/lib/sources/datagouv.js
+++ b/lib/sources/datagouv.js
@@ -1,57 +1,42 @@
 const got = require('got')
 const {chain} = require('lodash')
 
-function isCertified(organization) {
-  const {badges} = organization
-
-  return badges.some(b => b.kind === 'certified')
-    && badges.some(b => b.kind === 'public-service')
-}
-
-function isBAL(resource) {
-  return resource.format === 'csv' || resource.url.endsWith('csv')
-}
+const URL_API_DATA_GOUV = process.env.URL_API_DATA_GOUV || 'https://www.data.gouv.fr/api/1'
+const PAGE_SIZE = 100
+const TAG = 'base-adresse-locale'
+const FORMAT = 'csv'
 
+// In-memory caches, keyed by organization id and by dataset id
 const organizationsCache = {}
+const datasetsCache = {}
 
 async function getOrganization(organizationId) {
   if (!(organizationId in organizationsCache)) {
-    const response = await got(`https://www.data.gouv.fr/api/1/organizations/${organizationId}/`, {responseType: 'json'})
+    const response = await got(`${URL_API_DATA_GOUV}/organizations/${organizationId}/`, {responseType: 'json'})
     organizationsCache[organizationId] = response.body
   }
 
   return organizationsCache[organizationId]
 }
 
-const datasetsCache = {}
-
 async function getDataset(datasetId) {
   if (!(datasetId in datasetsCache)) {
-    const response = await got(`https://www.data.gouv.fr/api/1/datasets/${datasetId}/`, {responseType: 'json'})
+    const response = await got(`${URL_API_DATA_GOUV}/datasets/${datasetId}/`, {responseType: 'json'})
     datasetsCache[datasetId] = response.body
   }
 
   return datasetsCache[datasetId]
 }
 
-async function getEligibleBALDatasets() {
-  const response = await got('https://www.data.gouv.fr/api/1/datasets/?tag=base-adresse-locale&format=csv&page_size=1000', {responseType: 'json'})
-
-  // Register in datasets cache
-  response.body.data.forEach(dataset => {
-    datasetsCache[dataset.id] = dataset
-  })
+function isCertified(organization) {
+  const {badges} = organization
 
-  const datasets = await Promise.all(
-    response.body.data
-      .filter(d => d.resources.some(r => isBAL(r)) && d.organization && !d.archived)
-      .map(async d => {
-        const organization = await getOrganization(d.organization.id)
-        return {...d, organization}
-      })
-  )
+  return badges.some(b => b.kind === 'certified')
+    && badges.some(b => b.kind === 'public-service')
+}
 
-  return datasets.filter(d => isCertified(d.organization))
+function isBAL(resource) {
+  return resource.format === 'csv' || resource.url.endsWith('csv')
 }
 
 function getBALUrl(dataset) {
@@ -64,4 +49,39 @@ function getBALUrl(dataset) {
   return mostRecentResource.url
 }
 
+function computeBetaGouvDatasetsUrl(page) {
+  return URL_API_DATA_GOUV
+    + '/datasets/'
+    + `?tag=${TAG}`
+    + `&format=${FORMAT}`
+    + `&page_size=${PAGE_SIZE}`
+    + `&page=${page}`
+}
+
+async function fetchDatasets(page = 1) {
+  const url = computeBetaGouvDatasetsUrl(page)
+  const response = await got(url, {responseType: 'json'})
+
+  // Keep non-archived datasets with a CSV resource and a certified public-service organization
+  const datasets = response.body.data
+    .filter(d => d.resources.some(r => isBAL(r)) && d.organization && !d.archived && isCertified(d.organization))
+
+  if (response.body.total > page * PAGE_SIZE) {
+    return [...datasets, ...await fetchDatasets(page + 1)]
+  }
+
+  return datasets
+}
+
+async function getEligibleBALDatasets() {
+  // Fetch every page of the listing, already filtered down to eligible datasets
+  const datasets = await fetchDatasets()
+  // Register them in the cache read by getDataset
+  datasets.forEach(dataset => {
+    datasetsCache[dataset.id] = dataset
+  })
+
+  return datasets
+}
+
 module.exports = {getEligibleBALDatasets, getOrganization, getDataset, getBALUrl}
diff --git a/lib/sources/index.js b/lib/sources/index.js
index 598170e..b0e9330 100644
--- a/lib/sources/index.js
+++ b/lib/sources/index.js
@@ -18,15 +18,16 @@ async function augmentCustomEntry(entry) {
     organization = await getOrganization(entry.organization || dataset.organization.id)
   }
 
+  // Without an explicit url or converter, use the URL returned by getBALUrl for the dataset
   if (!entry.url && !entry.converter && dataset) {
-    return {...entry, dataset, organization, url: getBALUrl(dataset)}
+    entry.url = getBALUrl(dataset)
   }
 
   return {
-    ...entry,
     type: 'github',
     dataset,
-    organization
+    organization,
+    ...entry
   }
 }
 
@@ -73,13 +74,16 @@ function computeMetaFromSource(source) {
 async function computeList() {
   const customSources = readYamlFile(sourcesFilePath)
 
+  // Dataset ids listed in the blacklist
   const blacklistedIds = customSources.blackList.map(e => e.dataset)
+  // Dataset ids referenced by whitelist entries
   const whitelistedIds = customSources.whiteList.filter(e => e.dataset).map(e => e.dataset)
-
+  // Eligible datasets fetched from data.gouv.fr
   const eligibleBALDatasets = await getEligibleBALDatasets()
+  // Drop datasets that are blacklisted or already referenced by a whitelist entry
   const selectedDatasets = eligibleBALDatasets
     .filter(d => !blacklistedIds.includes(d.id) && !whitelistedIds.includes(d.id))
-
+  // Whitelist entries first, then the remaining data.gouv.fr datasets
   const sources = [
     ...(await Promise.all(customSources.whiteList.map(source => augmentCustomEntry(source)))),
     ...(selectedDatasets.map(dataset => prepareEligibleEntry(dataset)))