Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize performance when Git is used as storage repository #1121

Merged
merged 11 commits into from
Nov 27, 2024
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [patch]

> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.

### Changed

- Optimize performance when Git is used as storage repository
Ndpnt marked this conversation as resolved.
Show resolved Hide resolved

## 2.7.1 - 2024-11-21

_Full changeset and discussions: [#1120](https://github.com/OpenTermsArchive/engine/pull/1120)._
Expand Down
23 changes: 14 additions & 9 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"sib-api-v3-sdk": "^8.2.1",
"simple-git": "^3.8.0",
"simple-git": "^3.27.0",
"swagger-jsdoc": "^6.2.8",
"swagger-ui-express": "^5.0.0",
"winston": "^3.9.0",
Expand Down
12 changes: 11 additions & 1 deletion src/archivist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export const EVENTS = [
'trackingStarted',
'trackingCompleted',
'inaccessibleContent',
'info',
'error',
'pluginError',
];
Expand All @@ -45,6 +46,7 @@ export default class Archivist extends events.EventEmitter {
}

async initialize() {
this.emit('info', 'Initializing engine…');
if (this.services) {
return;
}
Expand All @@ -67,6 +69,8 @@ export default class Archivist extends events.EventEmitter {
process.exit(1);
});

this.emit('info', 'Initialization completed');

return this;
}

Expand Down Expand Up @@ -140,7 +144,13 @@ export default class Archivist extends events.EventEmitter {
return;
}

return this.recordVersion(terms, extractOnly);
await this.recordVersion(terms, extractOnly);

terms.sourceDocuments.forEach(sourceDocument => {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we just do terms.sourceDocuments = [] or terms.sourceDocuments = null?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because doing so would result in losing information needed for the next run, such as location, contentSelectors, …

sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
sourceDocument.mimeType = null; // …and associated MIME type
sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
});
}

async fetchSourceDocuments(terms) {
Expand Down
30 changes: 23 additions & 7 deletions src/archivist/recorder/repositories/git/git.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ export default class Git {
await fs.mkdir(this.path, { recursive: true });
}

this.git = simpleGit(this.path, { maxConcurrentProcesses: 1 });
this.git = simpleGit(this.path, {
trimmed: true,
maxConcurrentProcesses: 1,
});

await this.git.init();

Expand All @@ -27,7 +30,8 @@ export default class Git {
.addConfig('push.default', 'current')
.addConfig('user.name', this.author.name)
.addConfig('user.email', this.author.email)
.addConfig('core.quotePath', false); // disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
.addConfig('core.quotePath', false) // Disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
.addConfig('core.commitGraph', true); // Enable `commit-graph` feature for efficient commit data storage, improving performance of operations like `git log`
}

add(filePath) {
Expand All @@ -42,7 +46,7 @@ export default class Git {
process.env.GIT_AUTHOR_DATE = commitDate;
process.env.GIT_COMMITTER_DATE = commitDate;

summary = await this.git.commit(message, filePath);
summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
} finally {
process.env.GIT_AUTHOR_DATE = '';
process.env.GIT_COMMITTER_DATE = '';
Expand All @@ -60,11 +64,11 @@ export default class Git {
}

listCommits(options = []) {
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]);
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`)
}

async getCommit(options) {
const [commit] = await this.listCommits([ '-1', ...options ]);
const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options

return commit;
}
Expand Down Expand Up @@ -103,8 +107,8 @@ export default class Git {
return this.git.clean('f', '-d');
}

async getFullHash(shortHash) {
return (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim();
getFullHash(shortHash) {
return this.git.show([ shortHash, '--pretty=%H', '-s' ]);
}

restore(path, commit) {
Expand All @@ -120,4 +124,16 @@ export default class Git {
relativePath(absolutePath) {
return path.relative(this.path, absolutePath); // Git needs a path relative to the .git directory, not an absolute one
}

async listFiles(path) {
return (await this.git.raw([ 'ls-files', path ])).split('\n');
}

async writeCommitGraph() {
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths' ]);
}

async updateCommitGraph() {
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths', '--append' ]);
}
}
18 changes: 12 additions & 6 deletions src/archivist/recorder/repositories/git/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export default class GitRepository extends RepositoryInterface {
async initialize() {
await this.git.initialize();
await this.git.cleanUp(); // Drop all uncommitted changes and remove all leftover files that may be present if the process was killed aggressively
await this.git.writeCommitGraph(); // Create or replace the commit graph with a new one to ensure it's fully consistent

return this;
}
Expand Down Expand Up @@ -56,17 +57,22 @@ export default class GitRepository extends RepositoryInterface {
return record;
}

finalize() {
if (!this.needsPublication) {
return;
async finalize() {
if (this.needsPublication) {
await this.git.pushChanges();
}

return this.git.pushChanges();
return this.git.updateCommitGraph();
}

async findLatest(serviceId, termsType, documentId) {
const filePath = DataMapper.generateFilePath(serviceId, termsType, documentId);
const commit = await this.git.getCommit([filePath]);
const matchingFilesPaths = await this.git.listFiles(DataMapper.generateFilePath(serviceId, termsType, documentId));

if (!matchingFilesPaths.length) {
return null;
}

const commit = await this.git.getCommit([...matchingFilesPaths]); // Returns the most recent commit that modified any of the matching files. If multiple files match the path pattern (e.g. both HTML and PDF versions exist), returns the commit that last modified any of them

return this.#toDomain(commit);
}
Expand Down
14 changes: 12 additions & 2 deletions src/logger/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import config from 'config';
import winston from 'winston';
import 'winston-mail';

import { formatDuration } from './utils.js';

const { combine, timestamp, printf, colorize } = winston.format;

const alignedWithColorsAndTime = combine(
Expand Down Expand Up @@ -82,6 +84,7 @@ logger.configure({

let recordedSnapshotsCount;
let recordedVersionsCount;
let trackingStartTime;

logger.onFirstSnapshotRecorded = ({ serviceId, termsType, documentId, id }) => {
logger.info({ message: `Recorded first snapshot with id ${id}`, serviceId, termsType, documentId });
Expand Down Expand Up @@ -119,14 +122,17 @@ logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
}
recordedSnapshotsCount = 0;
recordedVersionsCount = 0;
trackingStartTime = Date.now();
};

logger.onTrackingCompleted = (numberOfServices, numberOfTerms, extractOnly) => {
const duration = formatDuration(Date.now() - trackingStartTime);

if (extractOnly) {
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction`);
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction in ${duration}`);
logger.info(`Recorded ${recordedVersionsCount} new versions\n`);
} else {
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services`);
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);
logger.info(`Recorded ${recordedSnapshotsCount} new snapshots and ${recordedVersionsCount} new versions\n`);
}
};
Expand All @@ -139,6 +145,10 @@ logger.onError = (error, terms) => {
logger.error({ message: error.stack, serviceId: terms.service.id, termsType: terms.type });
};

logger.onInfo = message => {
logger.info({ message });
};

logger.onPluginError = (error, pluginName) => {
logger.error({ message: `Error in "${pluginName}" plugin: ${error.stack}` });
};
Expand Down
22 changes: 22 additions & 0 deletions src/logger/utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
export const formatDuration = milliseconds => {
const seconds = Math.floor(milliseconds / 1000);
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const remainingSeconds = seconds % 60;

const parts = [];

if (hours > 0) {
parts.push(`${hours} hour${hours > 1 ? 's' : ''}`);
}

if (minutes > 0) {
parts.push(`${minutes} minute${minutes > 1 ? 's' : ''}`);
}

if (remainingSeconds > 0 || parts.length === 0) {
parts.push(`${remainingSeconds} second${remainingSeconds !== 1 ? 's' : ''}`);
}

return parts.join(' and ');
};
Loading