Skip to content

Commit

Permalink
#94 getMetadata function for extraction of docx file metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
jjhbw committed Aug 19, 2020
1 parent 1e8a9de commit d36c4b2
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 9 deletions.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,32 @@ const commands = await listCommands(template_buffer, ['{', '}']);
]
```

The `getMetadata` function extracts metadata fields from a document, such as the number of pages or words. Not every document contains every field; fields that are missing from the document are returned as `undefined`.

```typescript
await getMetadata(template)
// result:
Object {
"category": undefined,
"characters": 24,
"company": undefined,
"created": "2015-08-16T18:55:00Z",
"creator": "Someone Else",
"description": undefined,
"lastModifiedBy": "Grau Panea, Guillermo",
"lastPrinted": undefined,
"lines": 1,
"modified": "2016-12-15T11:21:00Z",
"pages": 1,
"paragraphs": 1,
"revision": "32",
"subject": undefined,
"template": "Normal.dotm",
"title": undefined,
"words": 4,
}
```

# Performance & security

**Templates can contain arbitrary javascript code. Beware of code injection risks!**
Expand Down
45 changes: 44 additions & 1 deletion src/__tests__/unit.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import path from 'path';
import { zipLoad } from '../zip';
import { readContentTypes, getMainDoc } from '../main';
import { readContentTypes, getMainDoc, getMetadata } from '../main';
import fs from 'fs';
import { setDebugLogSink } from '../debug';

Expand All @@ -26,3 +26,46 @@ describe('[Content_Types].xml parser', () => {
expect(main_doc).toStrictEqual('document2.xml');
});
});

// Unit tests for getMetadata, which is given the raw bytes of a .docx file
// and returns an object of document properties.
describe('getMetadata', () => {
// Despite its name, this test pins the complete metadata object returned for
// the simpleQuery.docx fixture, not only the page count.
it('finds the number of pages', async () => {
const template = await fs.promises.readFile(
path.join(__dirname, 'fixtures', 'simpleQuery.docx')
);
expect(await getMetadata(template)).toMatchInlineSnapshot(`
Object {
"category": undefined,
"characters": 24,
"company": undefined,
"created": "2015-08-16T18:55:00Z",
"creator": "Unga Graorg",
"description": undefined,
"lastModifiedBy": "Grau Panea, Guillermo",
"lastPrinted": undefined,
"lines": 1,
"modified": "2016-12-15T11:21:00Z",
"pages": 1,
"paragraphs": 1,
"revision": "32",
"subject": undefined,
"template": "Normal.dotm",
"title": undefined,
"words": 4,
}
`);
});

// Runs getMetadata over every .docx file in the fixtures directory and checks
// that each one yields a string `modified` value.
it('smoke test: does not crash on normal docx files', async () => {
// Require at least one assertion to run, so the test cannot pass vacuously
// when the loop below finds no .docx fixture.
expect.hasAssertions();
const files = await fs.promises.readdir(
path.join(__dirname, 'fixtures'),
'utf-8'
);
for (const f of files) {
// Skip anything in the fixtures directory that is not a .docx file.
if (!f.endsWith('.docx')) continue;
const t = await fs.promises.readFile(path.join(__dirname, 'fixtures', f));
const metadata = await getMetadata(t);
expect(typeof metadata.modified).toBe('string');
}
});
});
74 changes: 66 additions & 8 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -333,18 +333,76 @@ export async function listCommands(
return commands;
}

export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
const contentTypesXml = await zipGetText(zip, CONTENT_TYPES_PATH);
if (contentTypesXml == null)
throw new TemplateParseError(`${CONTENT_TYPES_PATH} could not be read`);
const node = await parseXml(contentTypesXml);
/**
 * Read the document properties stored inside a docx file.
 *
 * Values are taken from two XML parts of the archive: `docProps/app.xml`
 * (page, word, character, line and paragraph counts, company, template) and
 * `docProps/core.xml` (title, subject, creator, description, last modified by,
 * revision, last printed, created, modified, category). Only direct children
 * of each part's root element are inspected.
 *
 * A property whose element is absent or has no children is reported as
 * `undefined`; count properties are also `undefined` when their text does not
 * convert to a finite number.
 *
 * @param template the docx template as a Buffer-like object
 * @returns an object with one entry per property listed above; counts are
 *   numbers, everything else is the element's raw text
 * @throws Error if the first child of a matched element is not a text node
 */
export async function getMetadata(template: Buffer) {
  const zip = await zipLoad(template);
  const appXml = await parsePath(zip, `docProps/app.xml`);
  const coreXml = await parsePath(zip, `docProps/core.xml`);
  // TODO: extract custom.xml as well?

  // Text of an element's first child, or undefined for a childless element.
  const textOf = (element: Node): string | undefined => {
    if (element._children.length === 0) return undefined;
    const first = element._children[0];
    if (first._fTextNode) return first._text;
    throw new Error(`Not a text node`);
  };

  // Text of the first direct non-text child of `parent` whose tag is `tag`.
  const childText = (parent: Node, tag: string): string | undefined => {
    const match = parent._children.find(
      child => !child._fTextNode && child._tag === tag
    );
    return match === undefined ? undefined : textOf(match);
  };

  // Numeric value of `raw`, or undefined when it is not a finite number
  // (which includes the case where `raw` itself is undefined).
  const toNumber = (raw: string | undefined): number | undefined => {
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : undefined;
  };

  const fromApp = (tag: string) => childText(appXml, tag);
  const fromCore = (tag: string) => childText(coreXml, tag);

  return {
    // from docProps/app.xml
    pages: toNumber(fromApp('Pages')),
    words: toNumber(fromApp('Words')),
    characters: toNumber(fromApp('Characters')),
    lines: toNumber(fromApp('Lines')),
    paragraphs: toNumber(fromApp('Paragraphs')),
    company: fromApp('Company'),
    template: fromApp('Template'),

    // from docProps/core.xml
    title: fromCore('dc:title'),
    subject: fromCore('dc:subject'),
    creator: fromCore('dc:creator'),
    description: fromCore('dc:description'),
    lastModifiedBy: fromCore('cp:lastModifiedBy'),
    revision: fromCore('cp:revision'),
    lastPrinted: fromCore('cp:lastPrinted'),
    created: fromCore('dcterms:created'),
    modified: fromCore('dcterms:modified'),
    category: fromCore('cp:category'),
  };
}

/**
 * Read one XML part out of the archive and parse it.
 *
 * @param zip the archive to read from
 * @param xml_path path of the XML part inside the archive
 * @returns the parsed root node of that part
 * @throws TemplateParseError if the part cannot be read, or if parsing it
 *   yields a text node rather than an element node
 */
async function parsePath(zip: JSZip, xml_path: string): Promise<NonTextNode> {
  const xmlFile = await zipGetText(zip, xml_path);
  if (xmlFile == null)
    throw new TemplateParseError(`${xml_path} could not be read`);
  const node = await parseXml(xmlFile);
  // This helper is called with several different paths, so the message names
  // the path that was actually requested. This is the only throw after
  // parsing: a non-text root must fall through to the return below.
  if (node._fTextNode)
    throw new TemplateParseError(`${xml_path} is a text node when parsed`);
  return node;
}

/**
 * Read and parse the part located at `CONTENT_TYPES_PATH` in the archive.
 *
 * @param zip the archive to read from
 * @returns the parsed root node of that part
 */
export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
  const contentTypes = await parsePath(zip, CONTENT_TYPES_PATH);
  return contentTypes;
}

export function getMainDoc(contentTypes: NonTextNode): string {
const MAIN_DOC_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml' as const;
for (const t of contentTypes._children) {
Expand Down

0 comments on commit d36c4b2

Please sign in to comment.