From 3d394279b379a9a04227429b6463739c25573e44 Mon Sep 17 00:00:00 2001 From: Mael Date: Fri, 27 Sep 2024 21:38:11 +0200 Subject: [PATCH 1/8] Avoid array creation and string interpolation at each formatLine run No significant time gain on my test dataset, but it might help on other datasets. https://transport.data.gouv.fr/datasets/tisseo-offre-de-transport-gtfs --- src/lib/import.ts | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/lib/import.ts b/src/lib/import.ts index 913fcf2..e93e964 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -604,21 +604,9 @@ const formatLine = ( } // Convert to midnight timestamp and add timestamp columns as integer seconds from midnight - const timeColumnNames = [ - 'start_time', - 'end_time', - 'arrival_time', - 'departure_time', - 'prior_notice_last_time', - 'prior_notice_start_time', - 'start_pickup_drop_off_window', - ]; - for (const timeColumnName of timeColumnNames) { + for (const [timeColumnName, timestampColumnName] of timeColumnNamesCouples) { if (formattedLine[timeColumnName]) { - const timestampColumnName = timeColumnName.endsWith('time') - ? `${timeColumnName}stamp` - : `${timeColumnName}_timestamp`; formattedLine[timestampColumnName] = calculateSecondsFromMidnight( formattedLine[timeColumnName], ); @@ -633,6 +621,20 @@ const formatLine = ( return formattedLine; }; +const timeColumnNames = [ + 'start_time', + 'end_time', + 'arrival_time', + 'departure_time', + 'prior_notice_last_time', + 'prior_notice_start_time', + 'start_pickup_drop_off_window', + ], + timeColumnNamesCouples = timeColumnNames.map((name) => [ + name, + name.endsWith('time') ? 
`${name}stamp` : `${name}_timestamp`, + ]); + const importLines = ( task: ITask, lines: { [x: string]: any; geojson?: string }[], From 990f7edf64c40f08f748c4182e0e8022c288b52e Mon Sep 17 00:00:00 2001 From: Mael Date: Fri, 27 Sep 2024 22:10:29 +0200 Subject: [PATCH 2/8] Add a new build-watch script --- package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 34051fa..044837c 100644 --- a/package.json +++ b/package.json @@ -78,7 +78,8 @@ "scripts": { "prepare": "husky", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", - "build": "tsup" + "build": "tsup", + "build-watch": "tsup --watch" }, "exports": { ".": { From 84cbd569d6f243a78d6a818cdb0a48553ff1c47c Mon Sep 17 00:00:00 2001 From: Mael Date: Fri, 27 Sep 2024 22:11:58 +0200 Subject: [PATCH 3/8] Memoize the calculate seconds from midnight function Looks like this function consumes 5 seconds out of 13 for my control dataset "toulouse". This memoization does not make us gain these 5 seconds, but could nevertheless help --- src/lib/import.ts | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lib/import.ts b/src/lib/import.ts index e93e964..b1b80b1 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -606,21 +606,31 @@ const formatLine = ( // Convert to midnight timestamp and add timestamp columns as integer seconds from midnight for (const [timeColumnName, timestampColumnName] of timeColumnNamesCouples) { - if (formattedLine[timeColumnName]) { - formattedLine[timestampColumnName] = calculateSecondsFromMidnight( - formattedLine[timeColumnName], - ); + const value = formattedLine[timeColumnName]; + if (value) { + formattedLine[timestampColumnName] = + cachedCalculateSecondsFromMidnight(value); // Ensure leading zeros for time columns - formattedLine[timeColumnName] = padLeadingZeros( - formattedLine[timeColumnName], - ); + formattedLine[timeColumnName] = padLeadingZeros(value); } } return 
formattedLine; }; +interface Dictionary { + [key: string]: T; +} +const cache: Dictionary = {}; +const cachedCalculateSecondsFromMidnight = (value: string) => { + const cached = cache[value]; + if (cached != null) return cached; + const computed = calculateSecondsFromMidnight(value); + cache[value] = computed; + return computed; +}; + const timeColumnNames = [ 'start_time', 'end_time', From 55aab6708088c1d8affb5153d418b0e0c484f9db Mon Sep 17 00:00:00 2001 From: Mael Date: Fri, 27 Sep 2024 22:35:55 +0200 Subject: [PATCH 4/8] Memoize both date functions --- src/lib/import.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/lib/import.ts b/src/lib/import.ts index b1b80b1..f5cc1fe 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -608,11 +608,11 @@ const formatLine = ( for (const [timeColumnName, timestampColumnName] of timeColumnNamesCouples) { const value = formattedLine[timeColumnName]; if (value) { - formattedLine[timestampColumnName] = - cachedCalculateSecondsFromMidnight(value); + const [seconds, date] = cachedCalculateDates(value); + formattedLine[timestampColumnName] = seconds; // Ensure leading zeros for time columns - formattedLine[timeColumnName] = padLeadingZeros(value); + formattedLine[timeColumnName] = date; } } @@ -622,11 +622,14 @@ const formatLine = ( interface Dictionary { [key: string]: T; } -const cache: Dictionary = {}; -const cachedCalculateSecondsFromMidnight = (value: string) => { +type Tuple = [seconds: number | null, date: string | null]; +const cache: Dictionary = {}; +const cachedCalculateDates = (value: string) => { const cached = cache[value]; if (cached != null) return cached; - const computed = calculateSecondsFromMidnight(value); + const seconds = calculateSecondsFromMidnight(value); + const date = padLeadingZeros(value); + const computed: Tuple = [seconds, date]; cache[value] = computed; return computed; }; From f678e0f9940fdeb42f5a8b4ca1e7bf126627c1a2 Mon Sep 17 00:00:00 2001 From: Mael 
Date: Thu, 3 Oct 2024 12:19:41 +0200 Subject: [PATCH 5/8] Create indexes after importing all the GTFS files --- src/lib/import.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lib/import.ts b/src/lib/import.ts index f5cc1fe..f8187e3 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -517,7 +517,14 @@ const createTables = (db: Database.Database) => { db.prepare( `CREATE TABLE ${model.filenameBase} (${columns.join(', ')});`, ).run(); + } +}; +const createIndexes = (db: Database.Database) => { + for (const model of Object.values(models) as Model[]) { + if (!model.schema) { + return; + } for (const column of model.schema.filter((column) => column.index)) { db.prepare( `CREATE INDEX idx_${model.filenameBase}_${column.name} ON ${model.filenameBase} (${column.name});`, @@ -889,6 +896,9 @@ export async function importGtfs(initialConfig: Config) { } }); + log(`Will now create DB indexes`); + createIndexes(db); + log( `Completed GTFS import for ${pluralize('agency', agencyCount, true)}\n`, ); From 668b0aef30b80bd2df2662c059a0c81829f40af8 Mon Sep 17 00:00:00 2001 From: Mael Date: Thu, 3 Oct 2024 12:25:17 +0200 Subject: [PATCH 6/8] Avoid creating a new db connection for each importLines batch --- src/lib/import.ts | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lib/import.ts b/src/lib/import.ts index f8187e3..3dd5587 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -656,15 +656,12 @@ const timeColumnNames = [ ]); const importLines = ( + db: Database.Database, task: ITask, lines: { [x: string]: any; geojson?: string }[], model: Model, totalLineCount: number, ) => { - const db = openDb({ - sqlitePath: task.sqlitePath, - }); - if (lines.length === 0) { return; } @@ -724,7 +721,7 @@ const importLines = ( ); }; -const importFiles = (task: ITask) => +const importFiles = (db: Database.Database, task: ITask) => mapSeries( Object.values(models), (model: Model) => @@ -788,7 +785,7 @@ const importFiles = (task: 
ITask) => lines.push(formatLine(record, model, totalLineCount)); // If we have a bunch of lines ready to insert, then do it if (lines.length >= maxInsertVariables / model.schema.length) { - importLines(task, lines, model, totalLineCount); + importLines(db, task, lines, model, totalLineCount); } } catch (error) { reject(error); @@ -799,7 +796,7 @@ const importFiles = (task: ITask) => parser.on('end', () => { try { // Insert all remaining lines - importLines(task, lines, model, totalLineCount); + importLines(db, task, lines, model, totalLineCount); } catch (error) { reject(error); } @@ -820,7 +817,7 @@ const importFiles = (task: ITask) => ); } const line = formatLine({ geojson: data }, model, totalLineCount); - importLines(task, [line], model, totalLineCount); + importLines(db, task, [line], model, totalLineCount); resolve(); }) .catch(reject); @@ -883,7 +880,7 @@ export async function importGtfs(initialConfig: Config) { } await readFiles(task); - await importFiles(task); + await importFiles(db, task); await updateRealtimeData(task); await rm(tempPath, { recursive: true }); From 07075f6b608fbabd5e7c15f3850cd22db28639bf Mon Sep 17 00:00:00 2001 From: Mael Date: Thu, 3 Oct 2024 13:42:43 +0200 Subject: [PATCH 7/8] Use sqlite's transaction method rather than batching prepare().run() --- src/lib/import.ts | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/lib/import.ts b/src/lib/import.ts index 3dd5587..5c693e7 100644 --- a/src/lib/import.ts +++ b/src/lib/import.ts @@ -776,6 +776,37 @@ const importFiles = (db: Database.Database, task: ITask) => ...task.csvOptions, }); + const columns = model.schema.filter((column) => column.name !== 'id'); + + const placeholder = columns.map(({ name }) => '@' + name).join(', '); + const prepareStatement = `INSERT ${task.ignoreDuplicates ? 
'OR IGNORE' : ''} INTO ${ + model.filenameBase + } (${columns + .map((column) => column.name) + .join(', ')}) VALUES (${placeholder})`; + + const insert = db.prepare(prepareStatement); + + const insertMany = db.transaction((lines) => { + for (const line of lines) { + if (task.prefix === undefined) { + insert.run(line); + } else { + const prefixedLine = Object.fromEntries( + Object.entries(line).map(([columnName, value], index) => [ + columnName, + columns[index].prefix === true + ? `${task.prefix}${value}` + : value, + ]), + ); + insert.run(prefixedLine); + } + } + }); + + let lines: { [x: string]: any; geojson?: string }[] = []; + parser.on('readable', () => { let record; @@ -783,10 +814,6 @@ const importFiles = (db: Database.Database, task: ITask) => try { totalLineCount += 1; lines.push(formatLine(record, model, totalLineCount)); - // If we have a bunch of lines ready to insert, then do it - if (lines.length >= maxInsertVariables / model.schema.length) { - importLines(db, task, lines, model, totalLineCount); - } } catch (error) { reject(error); } @@ -795,8 +822,7 @@ const importFiles = (db: Database.Database, task: ITask) => parser.on('end', () => { try { - // Insert all remaining lines - importLines(db, task, lines, model, totalLineCount); + insertMany(lines); } catch (error) { reject(error); } From c4167829b72ff51bce0601a1d95cd25232f8c76a Mon Sep 17 00:00:00 2001 From: Mael Date: Thu, 3 Oct 2024 14:13:28 +0200 Subject: [PATCH 8/8] Fix getStops with bounding box test : order is not important --- src/test/get-stops.test.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/test/get-stops.test.ts b/src/test/get-stops.test.ts index cb8b78f..72a4077 100644 --- a/src/test/get-stops.test.ts +++ b/src/test/get-stops.test.ts @@ -1,5 +1,6 @@ import config from './test-config.ts'; import { openDb, closeDb, importGtfs, getStops } from '../index.ts'; +import { sortBy } from 'lodash-es'; import exp from 'constants'; beforeAll(async () => { @@ -316,6 
+317,10 @@ describe('getStops():', () => { ]; expect(results).toHaveLength(3); - expect(results).toEqual(expectedResult); + + // Results aren't sorted by distance, so the DB insert statement can influence the result order + expect(sortBy(results, 'stop_id')).toEqual( + sortBy(expectedResult, 'stop_id'), + ); }); });