Skip to content

Commit

Permalink
Optimize performance when Git is used as storage repository (#1121)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ndpnt authored Nov 27, 2024
2 parents 36d2390 + a9ed0dc commit 78d0153
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 26 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [patch]

> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
### Changed

- Optimize performance for Git storage

## 2.7.1 - 2024-11-21

_Full changeset and discussions: [#1120](https://github.com/OpenTermsArchive/engine/pull/1120)._
Expand Down
23 changes: 14 additions & 9 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"sib-api-v3-sdk": "^8.2.1",
"simple-git": "^3.8.0",
"simple-git": "^3.27.0",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.0",
"winston": "^3.9.0",
Expand Down
12 changes: 11 additions & 1 deletion src/archivist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export const EVENTS = [
'trackingStarted',
'trackingCompleted',
'inaccessibleContent',
'info',
'error',
'pluginError',
];
Expand All @@ -45,6 +46,7 @@ export default class Archivist extends events.EventEmitter {
}

async initialize() {
this.emit('info', 'Initializing engine…');
if (this.services) {
return;
}
Expand All @@ -67,6 +69,8 @@ export default class Archivist extends events.EventEmitter {
process.exit(1);
});

this.emit('info', 'Initialization completed');

return this;
}

Expand Down Expand Up @@ -140,7 +144,13 @@ export default class Archivist extends events.EventEmitter {
return;
}

return this.recordVersion(terms, extractOnly);
await this.recordVersion(terms, extractOnly);

terms.sourceDocuments.forEach(sourceDocument => {
sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
sourceDocument.mimeType = null; // …and associated MIME type
sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
});
}

async fetchSourceDocuments(terms) {
Expand Down
30 changes: 23 additions & 7 deletions src/archivist/recorder/repositories/git/git.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ export default class Git {
await fs.mkdir(this.path, { recursive: true });
}

this.git = simpleGit(this.path, { maxConcurrentProcesses: 1 });
this.git = simpleGit(this.path, {
trimmed: true,
maxConcurrentProcesses: 1,
});

await this.git.init();

Expand All @@ -27,7 +30,8 @@ export default class Git {
.addConfig('push.default', 'current')
.addConfig('user.name', this.author.name)
.addConfig('user.email', this.author.email)
.addConfig('core.quotePath', false); // disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
.addConfig('core.quotePath', false) // Disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
.addConfig('core.commitGraph', true); // Enable `commit-graph` feature for efficient commit data storage, improving performance of operations like `git log`
}

add(filePath) {
Expand All @@ -42,7 +46,7 @@ export default class Git {
process.env.GIT_AUTHOR_DATE = commitDate;
process.env.GIT_COMMITTER_DATE = commitDate;

summary = await this.git.commit(message, filePath);
summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
} finally {
process.env.GIT_AUTHOR_DATE = '';
process.env.GIT_COMMITTER_DATE = '';
Expand All @@ -60,11 +64,11 @@ export default class Git {
}

listCommits(options = []) {
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]);
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`)
}

async getCommit(options) {
const [commit] = await this.listCommits([ '-1', ...options ]);
const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options

return commit;
}
Expand Down Expand Up @@ -103,8 +107,8 @@ export default class Git {
return this.git.clean('f', '-d');
}

async getFullHash(shortHash) {
return (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim();
getFullHash(shortHash) {
return this.git.show([ shortHash, '--pretty=%H', '-s' ]);
}

restore(path, commit) {
Expand All @@ -120,4 +124,16 @@ export default class Git {
relativePath(absolutePath) {
return path.relative(this.path, absolutePath); // Git needs a path relative to the .git directory, not an absolute one
}

async listFiles(path) {
return (await this.git.raw([ 'ls-files', path ])).split('\n');
}

async writeCommitGraph() {
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths' ]);
}

async updateCommitGraph() {
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths', '--append' ]);
}
}
18 changes: 12 additions & 6 deletions src/archivist/recorder/repositories/git/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export default class GitRepository extends RepositoryInterface {
async initialize() {
await this.git.initialize();
await this.git.cleanUp(); // Drop all uncommitted changes and remove all leftover files that may be present if the process was killed aggressively
await this.git.writeCommitGraph(); // Create or replace the commit graph with a new one to ensure it's fully consistent

return this;
}
Expand Down Expand Up @@ -56,17 +57,22 @@ export default class GitRepository extends RepositoryInterface {
return record;
}

finalize() {
if (!this.needsPublication) {
return;
async finalize() {
if (this.needsPublication) {
await this.git.pushChanges();
}

return this.git.pushChanges();
return this.git.updateCommitGraph();
}

async findLatest(serviceId, termsType, documentId) {
const filePath = DataMapper.generateFilePath(serviceId, termsType, documentId);
const commit = await this.git.getCommit([filePath]);
const matchingFilesPaths = await this.git.listFiles(DataMapper.generateFilePath(serviceId, termsType, documentId));

if (!matchingFilesPaths.length) {
return null;
}

const commit = await this.git.getCommit([...matchingFilesPaths]); // Returns the most recent commit that modified any of the matching files. If multiple files match the path pattern (e.g. both HTML and PDF versions exist), returns the commit that last modified any of them

return this.#toDomain(commit);
}
Expand Down
14 changes: 12 additions & 2 deletions src/logger/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import config from 'config';
import winston from 'winston';
import 'winston-mail';

import { formatDuration } from './utils.js';

const { combine, timestamp, printf, colorize } = winston.format;

const alignedWithColorsAndTime = combine(
Expand Down Expand Up @@ -82,6 +84,7 @@ logger.configure({

let recordedSnapshotsCount;
let recordedVersionsCount;
let trackingStartTime;

logger.onFirstSnapshotRecorded = ({ serviceId, termsType, documentId, id }) => {
logger.info({ message: `Recorded first snapshot with id ${id}`, serviceId, termsType, documentId });
Expand Down Expand Up @@ -119,14 +122,17 @@ logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
}
recordedSnapshotsCount = 0;
recordedVersionsCount = 0;
trackingStartTime = Date.now();
};

logger.onTrackingCompleted = (numberOfServices, numberOfTerms, extractOnly) => {
const duration = formatDuration(Date.now() - trackingStartTime);

if (extractOnly) {
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction`);
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction in ${duration}`);
logger.info(`Recorded ${recordedVersionsCount} new versions\n`);
} else {
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services`);
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);
logger.info(`Recorded ${recordedSnapshotsCount} new snapshots and ${recordedVersionsCount} new versions\n`);
}
};
Expand All @@ -139,6 +145,10 @@ logger.onError = (error, terms) => {
logger.error({ message: error.stack, serviceId: terms.service.id, termsType: terms.type });
};

logger.onInfo = message => {
logger.info({ message });
};

logger.onPluginError = (error, pluginName) => {
logger.error({ message: `Error in "${pluginName}" plugin: ${error.stack}` });
};
Expand Down
22 changes: 22 additions & 0 deletions src/logger/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
export const formatDuration = milliseconds => {
const seconds = Math.floor(milliseconds / 1000);
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const remainingSeconds = seconds % 60;

const parts = [];

if (hours > 0) {
parts.push(`${hours} hour${hours > 1 ? 's' : ''}`);
}

if (minutes > 0) {
parts.push(`${minutes} minute${minutes > 1 ? 's' : ''}`);
}

if (remainingSeconds > 0 || parts.length === 0) {
parts.push(`${remainingSeconds} second${remainingSeconds !== 1 ? 's' : ''}`);
}

return parts.join(' and ');
};

0 comments on commit 78d0153

Please sign in to comment.