From d36c4b29fcc9a1928a64b7702a933e7dd71b979e Mon Sep 17 00:00:00 2001 From: Jurriaan Barkey Wolf Date: Tue, 23 Jun 2020 11:51:34 +0200 Subject: [PATCH] #94 getMetadata function for extraction of docx file metadata --- README.md | 26 ++++++++++++++ src/__tests__/unit.test.ts | 45 ++++++++++++++++++++++- src/main.ts | 74 +++++++++++++++++++++++++++++++++----- 3 files changed, 136 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d1c64989..4b4c893e 100755 --- a/README.md +++ b/README.md @@ -505,6 +505,32 @@ const commands = await listCommands(template_buffer, ['{', '}']); ] ``` +The `getMetadata` function lets you extract the metadata fields from a document, such as the number of pages or words. Note that not all fields may be available, depending on the document. + +```typescript + await getMetadata(template) + // result: + Object { + "category": undefined, + "characters": 24, + "company": undefined, + "created": "2015-08-16T18:55:00Z", + "creator": "Someone Else", + "description": undefined, + "lastModifiedBy": "Grau Panea, Guillermo", + "lastPrinted": undefined, + "lines": 1, + "modified": "2016-12-15T11:21:00Z", + "pages": 1, + "paragraphs": 1, + "revision": "32", + "subject": undefined, + "template": "Normal.dotm", + "title": undefined, + "words": 4, + } +``` + # Performance & security **Templates can contain arbitrary javascript code. Beware of code injection risks!** diff --git a/src/__tests__/unit.test.ts b/src/__tests__/unit.test.ts index 922fb5e5..f664e0fe 100644 --- a/src/__tests__/unit.test.ts +++ b/src/__tests__/unit.test.ts @@ -1,6 +1,6 @@ import path from 'path'; import { zipLoad } from '../zip'; -import { readContentTypes, getMainDoc } from '../main'; +import { readContentTypes, getMainDoc, getMetadata } from '../main'; import fs from 'fs'; import { setDebugLogSink } from '../debug'; @@ -26,3 +26,46 @@ describe('[Content_Types].xml parser', () => { expect(main_doc).toStrictEqual('document2.xml'); }); }); + +describe('getMetadata', () => { + it('finds the number of pages', async () => { + const template = await fs.promises.readFile( + path.join(__dirname, 'fixtures', 'simpleQuery.docx') + ); + expect(await getMetadata(template)).toMatchInlineSnapshot(` + Object { + "category": undefined, + "characters": 24, + "company": undefined, + "created": "2015-08-16T18:55:00Z", + "creator": "Unga Graorg", + "description": undefined, + "lastModifiedBy": "Grau Panea, Guillermo", + "lastPrinted": undefined, + "lines": 1, + "modified": "2016-12-15T11:21:00Z", + "pages": 1, + "paragraphs": 1, + "revision": "32", + "subject": undefined, + "template": "Normal.dotm", + "title": undefined, + "words": 4, + } + `); + }); + + it('smoke test: does not crash on normal docx files', async () => { + expect.hasAssertions(); + const files = await fs.promises.readdir( + path.join(__dirname, 'fixtures'), + 'utf-8' + ); + for (const f of files) { + if (!f.endsWith('.docx')) continue; + const t = await fs.promises.readFile(path.join(__dirname, 'fixtures', f)); + const metadata = await getMetadata(t); + expect(typeof metadata.modified).toBe('string'); + } + }); +}); diff --git a/src/main.ts b/src/main.ts index 99c32e72..f66eae93 100644 --- a/src/main.ts +++ b/src/main.ts @@ -333,18 +333,76 @@ export async function listCommands( return commands; } -export async function readContentTypes(zip: JSZip): Promise { - const contentTypesXml = await zipGetText(zip, CONTENT_TYPES_PATH); - if (contentTypesXml == null) - throw new TemplateParseError(`${CONTENT_TYPES_PATH} could not be read`); - const node = await parseXml(contentTypesXml); +/** + * Extract metadata from a document, such as the number of pages or words. + * @param template the docx template as a Buffer-like object + */ +export async function getMetadata(template: Buffer) { + const app_xml_path = `docProps/app.xml`; + const core_xml_path = `docProps/core.xml`; + const zip = await zipLoad(template); + const appXml = await parsePath(zip, app_xml_path); + const coreXml = await parsePath(zip, core_xml_path); + // TODO: extract custom.xml as well? + + function getText(t: Node): string | undefined { + if (t._children.length === 0) return undefined; + const n = t._children[0]; + if (n._fTextNode) return n._text; + throw new Error(`Not a text node`); + } + + function findNodeText(m: Node, tag: string): string | undefined { + for (const t of m._children) { + if (t._fTextNode) continue; + if (t._tag === tag) return getText(t); + } + return; + } + + const numberize = (a: any): number | undefined => { + const c = Number(a); + if (Number.isFinite(c)) return c; + return; + }; + + return { + pages: numberize(findNodeText(appXml, 'Pages')), + words: numberize(findNodeText(appXml, 'Words')), + characters: numberize(findNodeText(appXml, 'Characters')), + lines: numberize(findNodeText(appXml, 'Lines')), + paragraphs: numberize(findNodeText(appXml, 'Paragraphs')), + company: findNodeText(appXml, 'Company'), + template: findNodeText(appXml, 'Template'), + + // from CoreXML + title: findNodeText(coreXml, 'dc:title'), + subject: findNodeText(coreXml, 'dc:subject'), + creator: findNodeText(coreXml, 'dc:creator'), + description: findNodeText(coreXml, 'dc:description'), + lastModifiedBy: findNodeText(coreXml, 'cp:lastModifiedBy'), + revision: findNodeText(coreXml, 'cp:revision'), + lastPrinted: findNodeText(coreXml, 'cp:lastPrinted'), + created: findNodeText(coreXml, 'dcterms:created'), + modified: findNodeText(coreXml, 'dcterms:modified'), + category: findNodeText(coreXml, 'cp:category'), + }; +} + +async function parsePath(zip: JSZip, xml_path: string): Promise { + const xmlFile = await zipGetText(zip, xml_path); + if (xmlFile == null) + throw new TemplateParseError(`${xml_path} could not be read`); + const node = await parseXml(xmlFile); if (node._fTextNode) - throw new TemplateParseError( - `${CONTENT_TYPES_PATH} is a text node when parsed` - ); + throw new TemplateParseError(`${xml_path} is a text node when parsed`); return node; } +export async function readContentTypes(zip: JSZip): Promise { + return await parsePath(zip, CONTENT_TYPES_PATH); +} + export function getMainDoc(contentTypes: NonTextNode): string { const MAIN_DOC_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml' as const; for (const t of contentTypes._children) {