From d36c4b29fcc9a1928a64b7702a933e7dd71b979e Mon Sep 17 00:00:00 2001
From: Jurriaan Barkey Wolf <jjhbarkeywolf@gmail.com>
Date: Tue, 23 Jun 2020 11:51:34 +0200
Subject: [PATCH] #94 getMetadata function for extraction of docx file metadata

---
 README.md                  | 26 ++++++++++++++
 src/__tests__/unit.test.ts | 45 ++++++++++++++++++++++-
 src/main.ts                | 74 +++++++++++++++++++++++++++++++++-----
 3 files changed, 136 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index d1c64989..4b4c893e 100755
--- a/README.md
+++ b/README.md
@@ -505,6 +505,32 @@ const commands = await listCommands(template_buffer, ['{', '}']);
 ]
 ```
 
+The `getMetadata` function lets you extract the metadata fields of a document, such as the number of pages or words. Not every document contains every field; fields that are missing from the document are returned as `undefined`.
+
+```typescript
+    await getMetadata(template)
+    // result:
+      Object {
+        "category": undefined,
+        "characters": 24,
+        "company": undefined,
+        "created": "2015-08-16T18:55:00Z",
+        "creator": "Someone Else",
+        "description": undefined,
+        "lastModifiedBy": "Grau Panea, Guillermo",
+        "lastPrinted": undefined,
+        "lines": 1,
+        "modified": "2016-12-15T11:21:00Z",
+        "pages": 1,
+        "paragraphs": 1,
+        "revision": "32",
+        "subject": undefined,
+        "template": "Normal.dotm",
+        "title": undefined,
+        "words": 4,
+      }
+```
+
 # Performance & security
 
 **Templates can contain arbitrary javascript code. Beware of code injection risks!**
diff --git a/src/__tests__/unit.test.ts b/src/__tests__/unit.test.ts
index 922fb5e5..f664e0fe 100644
--- a/src/__tests__/unit.test.ts
+++ b/src/__tests__/unit.test.ts
@@ -1,6 +1,6 @@
 import path from 'path';
 import { zipLoad } from '../zip';
-import { readContentTypes, getMainDoc } from '../main';
+import { readContentTypes, getMainDoc, getMetadata } from '../main';
 import fs from 'fs';
 import { setDebugLogSink } from '../debug';
 
@@ -26,3 +26,46 @@ describe('[Content_Types].xml parser', () => {
     expect(main_doc).toStrictEqual('document2.xml');
   });
 });
+
+describe('getMetadata', () => { // getMetadata reads docProps/app.xml and docProps/core.xml of a docx
+  it('finds the number of pages', async () => { // the snapshot below pins every returned field, not only `pages`
+    const template = await fs.promises.readFile(
+      path.join(__dirname, 'fixtures', 'simpleQuery.docx')
+    );
+    expect(await getMetadata(template)).toMatchInlineSnapshot(`
+      Object {
+        "category": undefined,
+        "characters": 24,
+        "company": undefined,
+        "created": "2015-08-16T18:55:00Z",
+        "creator": "Unga Graorg",
+        "description": undefined,
+        "lastModifiedBy": "Grau Panea, Guillermo",
+        "lastPrinted": undefined,
+        "lines": 1,
+        "modified": "2016-12-15T11:21:00Z",
+        "pages": 1,
+        "paragraphs": 1,
+        "revision": "32",
+        "subject": undefined,
+        "template": "Normal.dotm",
+        "title": undefined,
+        "words": 4,
+      }
+    `);
+  });
+
+  it('smoke test: does not crash on normal docx files', async () => {
+    expect.hasAssertions(); // fails the test if no expect() runs, e.g. when no .docx fixture is found
+    const files = await fs.promises.readdir(
+      path.join(__dirname, 'fixtures'),
+      'utf-8'
+    );
+    for (const f of files) {
+      if (!f.endsWith('.docx')) continue; // only .docx fixtures are fed to getMetadata
+      const t = await fs.promises.readFile(path.join(__dirname, 'fixtures', f));
+      const metadata = await getMetadata(t);
+      expect(typeof metadata.modified).toBe('string'); // `modified` comes from dcterms:modified in core.xml
+    }
+  });
+});
diff --git a/src/main.ts b/src/main.ts
index 99c32e72..f66eae93 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -333,18 +333,76 @@ export async function listCommands(
   return commands;
 }
 
-export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
-  const contentTypesXml = await zipGetText(zip, CONTENT_TYPES_PATH);
-  if (contentTypesXml == null)
-    throw new TemplateParseError(`${CONTENT_TYPES_PATH} could not be read`);
-  const node = await parseXml(contentTypesXml);
+/**
+ * Extract metadata from a document, such as the number of pages or words. Rejects with TemplateParseError if docProps/app.xml or docProps/core.xml cannot be read.
+ * @param template the docx template as a Buffer-like object
+ * @returns fields read from docProps/app.xml and docProps/core.xml; a field is `undefined` when its element is absent, empty or (for counts) not a finite number
+ */
+export async function getMetadata(template: Buffer) {
+  const app_xml_path = `docProps/app.xml`;
+  const core_xml_path = `docProps/core.xml`;
+  const zip = await zipLoad(template);
+  const appXml = await parsePath(zip, app_xml_path);
+  const coreXml = await parsePath(zip, core_xml_path);
+  // TODO: extract custom.xml as well?
+
+  // Text of an element's first child; undefined when the element has no children.
+  function getText(t: Node): string | undefined {
+    if (t._children.length === 0) return undefined;
+    const n = t._children[0];
+    if (n._fTextNode) return n._text;
+    throw new Error(`Not a text node`);
+  }
+
+  // Text of the first direct child element of `m` whose tag equals `tag`, if there is one.
+  function findNodeText(m: Node, tag: string): string | undefined {
+    const hit = m._children.find(t => !t._fTextNode && t._tag === tag);
+    return hit === undefined ? undefined : getText(hit);
+  }
+
+  const numberize = (a: string | undefined): number | undefined => {
+    // Number('') and Number(' ') are 0, so blank text is rejected before converting.
+    const c = (a == null || a.trim() === '') ? NaN : Number(a);
+    return Number.isFinite(c) ? c : undefined;
+  };
+
+  return {
+    // from docProps/app.xml
+    pages: numberize(findNodeText(appXml, 'Pages')),
+    words: numberize(findNodeText(appXml, 'Words')),
+    characters: numberize(findNodeText(appXml, 'Characters')),
+    lines: numberize(findNodeText(appXml, 'Lines')),
+    paragraphs: numberize(findNodeText(appXml, 'Paragraphs')),
+    company: findNodeText(appXml, 'Company'),
+    template: findNodeText(appXml, 'Template'),
+    // from docProps/core.xml
+    title: findNodeText(coreXml, 'dc:title'),
+    subject: findNodeText(coreXml, 'dc:subject'),
+    creator: findNodeText(coreXml, 'dc:creator'),
+    description: findNodeText(coreXml, 'dc:description'),
+    lastModifiedBy: findNodeText(coreXml, 'cp:lastModifiedBy'),
+    revision: findNodeText(coreXml, 'cp:revision'),
+    lastPrinted: findNodeText(coreXml, 'cp:lastPrinted'),
+    created: findNodeText(coreXml, 'dcterms:created'),
+    modified: findNodeText(coreXml, 'dcterms:modified'),
+    category: findNodeText(coreXml, 'cp:category'),
+  };
+}
+
+async function parsePath(zip: JSZip, xml_path: string): Promise<NonTextNode> { // read the zip entry at xml_path and parse it as XML
+  const xmlFile = await zipGetText(zip, xml_path);
+  if (xmlFile == null) // loose equality: matches both null and undefined
+    throw new TemplateParseError(`${xml_path} could not be read`);
+  const node = await parseXml(xmlFile);
   if (node._fTextNode)
-    throw new TemplateParseError(
-      `${CONTENT_TYPES_PATH} is a text node when parsed`
-    );
+    throw new TemplateParseError(`${xml_path} is a text node when parsed`);
   return node;
 }
 
+export async function readContentTypes(zip: JSZip): Promise<NonTextNode> {
+  return parsePath(zip, CONTENT_TYPES_PATH); // thin wrapper: parsePath does the reading, parsing and error reporting
+}
+
 export function getMainDoc(contentTypes: NonTextNode): string {
   const MAIN_DOC_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml' as const;
   for (const t of contentTypes._children) {