Skip to content

Commit

Permalink
refacto get datasets beta.gouv
Browse files Browse the repository at this point in the history
  • Loading branch information
fufeck committed Oct 2, 2023
1 parent 452d563 commit 2dcaaea
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 35 deletions.
1 change: 1 addition & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ADMIN_TOKEN=
API_DEPOT_URL=https://plateforme-bal.adresse.data.gouv.fr/api-depot
API_DEPOT_CLIENT_SECRET=
API_DEPOT_CLIENT_ID=
URL_API_DATA_GOUV=https://www.data.gouv.fr/api/1

S3_ENDPOINT=https://s3.gra.io.cloud.ovh.net/
S3_CONTAINER_ID=
Expand Down
80 changes: 50 additions & 30 deletions lib/sources/datagouv.js
Original file line number Diff line number Diff line change
@@ -1,57 +1,42 @@
const got = require('got')
const {chain} = require('lodash')

/**
 * Check that an organization holds both badges required for eligibility.
 *
 * @param {Object} organization - Organization object exposing a `badges` array of `{kind}` entries.
 * @returns {boolean} `true` when one badge has kind `certified` and one has kind `public-service`.
 */
function isCertified(organization) {
  const badgeList = organization.badges
  const hasBadgeOfKind = kind => badgeList.some(badge => badge.kind === kind)

  if (!hasBadgeOfKind('certified')) {
    return false
  }

  return hasBadgeOfKind('public-service')
}

/**
 * Decide whether a dataset resource looks like a CSV file.
 *
 * @param {Object} resource - Resource with a `format` field and a `url` string.
 * @returns {boolean} `true` when `format` is `csv`, otherwise whether `url` ends with `csv`.
 */
function isBAL(resource) {
  // The declared format wins; the URL is only inspected as a fallback.
  if (resource.format === 'csv') {
    return true
  }

  return resource.url.endsWith('csv')
}
// Base URL of the data.gouv API; can be overridden with the URL_API_DATA_GOUV environment variable.
const URL_API_DATA_GOUV = process.env.URL_API_DATA_GOUV || 'https://www.data.gouv.fr/api/1'
// Value sent as the `page_size` query parameter when listing datasets.
const PAGE_SIZE = 100
// Value sent as the `tag` query parameter when listing datasets.
const TAG = 'base-adresse-locale'
// Value sent as the `format` query parameter when listing datasets.
const FORMAT = 'csv'

// In-memory caches, keyed by organization id and by dataset id respectively.
const organizationsCache = {}
const datasetsCache = {}

/**
 * Fetch an organization from the data.gouv API, memoizing the response body.
 *
 * @param {string} organizationId - Identifier inserted in the `/organizations/{id}/` path.
 * @returns {Promise<Object>} JSON body returned by the API; later calls with the same id
 *   are served from `organizationsCache` without a new request.
 */
async function getOrganization(organizationId) {
  // Own-property check: a plain `in` test would treat inherited keys
  // (e.g. "constructor") as cache hits and return a prototype member.
  if (!Object.prototype.hasOwnProperty.call(organizationsCache, organizationId)) {
    // Single request against the configurable base URL (the hard-coded duplicate
    // declaration of `response` made the function unparseable).
    const response = await got(`${URL_API_DATA_GOUV}/organizations/${organizationId}/`, {responseType: 'json'})
    organizationsCache[organizationId] = response.body
  }

  return organizationsCache[organizationId]
}

/**
 * Fetch a dataset from the data.gouv API, memoizing the response body.
 *
 * Relies on the module-level `datasetsCache` declared with the other caches at the
 * top of the file (it must not be redeclared here).
 *
 * @param {string} datasetId - Identifier inserted in the `/datasets/{id}/` path.
 * @returns {Promise<Object>} JSON body returned by the API; later calls with the same id
 *   are served from `datasetsCache` without a new request.
 */
async function getDataset(datasetId) {
  // Own-property check: a plain `in` test would treat inherited keys
  // (e.g. "constructor") as cache hits and return a prototype member.
  if (!Object.prototype.hasOwnProperty.call(datasetsCache, datasetId)) {
    // Single request against the configurable base URL.
    const response = await got(`${URL_API_DATA_GOUV}/datasets/${datasetId}/`, {responseType: 'json'})
    datasetsCache[datasetId] = response.body
  }

  return datasetsCache[datasetId]
}

/**
 * Tell whether an organization carries both the `certified` and `public-service` badges.
 *
 * @param {Object} organization - Organization object exposing a `badges` array of `{kind}` entries.
 * @returns {boolean} `true` only when both badge kinds are present.
 */
function isCertified(organization) {
  const {badges} = organization

  return badges.some(b => b.kind === 'certified')
    && badges.some(b => b.kind === 'public-service')
}

/**
 * Tell whether a dataset resource is a CSV file.
 *
 * @param {Object} resource - Resource with a `format` field and a `url` string.
 * @returns {boolean} `true` when `format` is `csv` or when `url` ends with `csv`.
 */
function isBAL(resource) {
  return resource.format === 'csv' || resource.url.endsWith('csv')
}

function getBALUrl(dataset) {
Expand All @@ -64,4 +49,39 @@ function getBALUrl(dataset) {
return mostRecentResource.url
}

/**
 * Build the URL of one page of the dataset listing on the data.gouv API.
 *
 * The query carries the module-level `TAG`, `FORMAT` and `PAGE_SIZE` values plus the
 * requested page number.
 *
 * @param {number} page - Page number to request.
 * @returns {string} Absolute listing URL, e.g. `<base>/datasets/?tag=…&format=…&page_size=…&page=…`.
 */
function computeBetaGouvDatasetsUrl(page) {
  // Drop trailing slashes from the configured base so the path never contains `//`.
  const baseUrl = URL_API_DATA_GOUV.replace(/\/+$/, '')
  // URLSearchParams percent-encodes values, unlike manual string concatenation.
  const query = new URLSearchParams({
    tag: TAG,
    format: FORMAT,
    page_size: PAGE_SIZE, // eslint-disable-line camelcase
    page
  })

  // A single slash between `/datasets` and the query (previously `/datasets//?tag=…`).
  return `${baseUrl}/datasets/?${query.toString()}`
}

/**
 * Download the dataset listing page by page, starting at `page`, and keep the eligible entries.
 *
 * A dataset is kept when at least one resource passes `isBAL`, it has an organization,
 * it is not archived, and its organization passes `isCertified`. Pages are requested
 * one after the other while the reported `total` exceeds `page * PAGE_SIZE`.
 *
 * @param {number} [page=1] - First page to request.
 * @returns {Promise<Object[]>} Eligible datasets from all requested pages, in listing order.
 */
async function fetchDatasets(page = 1) {
  // Same checks, in the same order, as a single `&&` chain.
  const isEligible = dataset => {
    if (!dataset.resources.some(resource => isBAL(resource))) {
      return false
    }

    if (!dataset.organization || dataset.archived) {
      return false
    }

    return isCertified(dataset.organization)
  }

  const collected = []
  let currentPage = page
  let hasNextPage = true

  while (hasNextPage) {
    // Pages are fetched sequentially: the next request depends on this page's total.
    // eslint-disable-next-line no-await-in-loop
    const {body} = await got(computeBetaGouvDatasetsUrl(currentPage), {responseType: 'json'})

    collected.push(...body.data.filter(dataset => isEligible(dataset)))

    hasNextPage = body.total > currentPage * PAGE_SIZE
    currentPage += 1
  }

  return collected
}

/**
 * Retrieve the eligible datasets and register each of them in `datasetsCache`.
 *
 * @returns {Promise<Object[]>} The datasets returned by `fetchDatasets()`.
 */
async function getEligibleBALDatasets() {
  const eligibleDatasets = await fetchDatasets()

  // Record every dataset in the cache under its id.
  for (const eligibleDataset of eligibleDatasets) {
    datasetsCache[eligibleDataset.id] = eligibleDataset
  }

  return eligibleDatasets
}

// Functions exposed to the rest of the project by this data.gouv source module.
module.exports = {getEligibleBALDatasets, getOrganization, getDataset, getBALUrl}
14 changes: 9 additions & 5 deletions lib/sources/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@ async function augmentCustomEntry(entry) {
organization = await getOrganization(entry.organization || dataset.organization.id)
}

// ADD URL TO ENTRY
if (!entry.url && !entry.converter && dataset) {
return {...entry, dataset, organization, url: getBALUrl(dataset)}
entry.url = getBALUrl(dataset)
}

return {
...entry,
type: 'github',
dataset,
organization
organization,
...entry
}
}

Expand Down Expand Up @@ -73,13 +74,16 @@ function computeMetaFromSource(source) {
async function computeList() {
const customSources = readYamlFile(sourcesFilePath)

// CREATE BLACKLIST ID DATASET
const blacklistedIds = customSources.blackList.map(e => e.dataset)
// CREATE WHITELIST ID DATASET
const whitelistedIds = customSources.whiteList.filter(e => e.dataset).map(e => e.dataset)

// GET DATASETS FROM DATAGOUV
const eligibleBALDatasets = await getEligibleBALDatasets()
// KEEP ONLY DATASETS THAT ARE NEITHER BLACKLISTED NOR WHITELISTED
const selectedDatasets = eligibleBALDatasets
.filter(d => !blacklistedIds.includes(d.id) && !whitelistedIds.includes(d.id))

// MIX WHITELIST AND DATAGOUV DATASETS
const sources = [
...(await Promise.all(customSources.whiteList.map(source => augmentCustomEntry(source)))),
...(selectedDatasets.map(dataset => prepareEligibleEntry(dataset)))
Expand Down

0 comments on commit 2dcaaea

Please sign in to comment.