Differential cmap compression code

fkaelberer · Aug 28, 2014 · b45239f · b45239f
1 parent 4834f1c
commit b45239f
Show file tree

Hide file tree

Showing 6 changed files with 1,043 additions and 58 deletions.
diff --git a/external/bcmaps_temp/savings.json b/external/bcmaps_temp/savings.json
diff --git a/external/cmapscompress/README.md b/external/cmapscompress/README.md
@@ -1,6 +1,6 @@
 # Quick notes about binary CMap format (bcmap)
 
-The format is designed to package some information from the CMap files located at external/cmap. Please notice for size optimization reasons, the original information blocks can be changed (split or joined) and items in the blocks can be swaped.
+The format is designed to package some information from the CMap files located at external/cmap. Please notice for size optimization reasons, the original information blocks can be changed (split or joined) and items in the blocks can be swapped.
 
 The data is stored in binary format in network byte order (big-endian).
 
@@ -15,6 +15,55 @@ The following primitives used during encoding of the file:
   - signed fixed number (SB[n]) – similar to the SN, but it represents a signed number that is stored in B[n]
   - string (S) – the string is encoded as a sequence of bytes. First comes the length in characters encoded as UN, then the UTF16 characters, each encoded as UN.
 
+# Differential compression
+
+The contents of each CMap file are stored either normally or differentially. In the latter case, a second CMap file (the 'base file') is needed to decode the file.
+
+The first record in each file indicates if the file is stored normally or differentially. 
+It is a *baseFileName* string (S) which
+  - is empty ('') if the file is stored normally, or
+  - contains the file name of the base file (without path or extension) if it is stored differentially.
+
+The second record, *contentSize*, is an unsigned number (UN) which contains
+  - the number of bytes to follow if the file is stored normally ('the file contents'), or
+  - the number of bytes to reconstruct via differential compression (see below).
+
+In either case, the (possibly decoded) file contents are then structured as described in the [file structure](#file-structure) section.
+
+### Decoding differential data
+
+If a CMap file (let's name it *A*) is stored differentially, file contents are to be constructed from the contents of *A* and from the base file (which we shall call *B*). 
+The records that follow alternate between the following two types, starting with *copy*.
+
+A **copy**-type instruction is specified by
+ - startDelta as UN
+ - length as UN
+
+which instructs the decoder to read *length* bytes from *B* and append them to the contents, where *startDelta* specifies the start position as an offset from the end of the previously copied range. (The previous end is initialized with the position of the start of content, i.e., after *baseFileName* and *contentSize* in *B*.)
+
+An **insert**-type instruction is specified by
+ - length as UN
+
+and instructs the decoder to read the following *length* bytes from *A* and append them to the contents.
+
+It may happen that file *B* itself is stored differentially and depends on a further file. In this case, *B* has to be restored before restoring *A*. The following pseudocode accomplishes the decoding:
+```
+var contents = '';
+var previousEnd = 0; // position after *baseFileName* and *contentSize* in baseFile
+for (var copy = true; contents.length < contentSize; copy = !copy) {
+  if (copy) {
+    var start = previousEnd + A.readUN();
+    var length = A.readUN();
+    contents.append(B.subarray(start, start + length));
+    previousEnd = start + length;
+  } else {
+    var length = A.readUN();
+    contents.append(A.readBytes(length));
+  }
+}
+```
+
+<a name="file-structure"></a>
 # File structure
 
 The first byte is a header:

diff --git a/external/cmapscompress/compress.js b/external/cmapscompress/compress.js
@@ -13,6 +13,8 @@
  * limitations under the License.
  */
 
+'use strict';
+
 var fs = require('fs');
 var path = require('path');
 var parseAdobeCMap = require('./parse.js').parseAdobeCMap;
@@ -112,8 +114,8 @@ function compressCmap(srcPath, destPath, verify) {
   fs.writeFileSync(destPath, new Buffer(out, 'hex'));
 
   if (verify) {
-    var result2 = parseCMap(out);
-    var isGood = JSON.stringify(inputData) == JSON.stringify(result2);
+    var result2 = new BinaryCMapReader(out).parse();
+    var isGood = JSON.stringify(inputData) === JSON.stringify(result2);
     if (!isGood) {
       throw new Error('Extracted data does not match the expected result');
     }
@@ -125,11 +127,13 @@ function compressCmap(srcPath, destPath, verify) {
   };
 }
 
-function parseCMap(binaryData) {
-  var reader = {
-    buffer: binaryData,
-    pos: 0,
-    end: binaryData.length,
+function BinaryCMapReader(binaryData) {
+  this.buffer = binaryData;
+  this.pos = 0;
+  this.end = binaryData.length;
+}
+
+BinaryCMapReader.prototype = {
   readByte: function () {
     if (this.pos >= this.end) {
       return -1;
@@ -164,7 +168,7 @@ function parseCMap(binaryData) {
     var stack = [];
     do {
       var b = this.readByte();
-        last = !(b & 0x80);
+      var last = !(b & 0x80);
       stack.push(b & 0x7F);
     } while (!last);
     var s = '', buffer = 0, bufferSize = 0;
@@ -198,10 +202,9 @@ function parseCMap(binaryData) {
       s += String.fromCharCode(this.readNumber());
     }
     return s;
-    }
-  };
-
-  var header = reader.readByte();
+  },
+  parse: function() {
+    var header = this.readByte();
     var result = {
       type: header >> 1,
       wmode: header & 1,
@@ -211,15 +214,15 @@ function parseCMap(binaryData) {
     };
 
     var b;
-  while ((b = reader.readByte()) >= 0) {
+    while ((b = this.readByte()) >= 0) {
       var type = b >> 5;
       if (type === 7) {
         switch (b & 0x1F) {
           case 0:
-          result.comment = reader.readString();
+            result.comment = this.readString();
             break;
           case 1:
-          result.usecmap = reader.readString();
+            result.usecmap = this.readString();
             break;
         }
         continue;
@@ -235,83 +238,84 @@ function parseCMap(binaryData) {
         item.sequence = true;
       }
       var ucs2DataSize = 1;
-    var subitemsCount = reader.readNumber();
+      var subitemsCount = this.readNumber();
       var start, end, code, char;
       switch (type) {
         case 0:
-        start = reader.readHex(dataSize);
-        end = addHex(reader.readHexNumber(dataSize), start);
+          start = this.readHex(dataSize);
+          end = addHex(this.readHexNumber(dataSize), start);
           subitems.push({start: start, end: end});
           for (var i = 1; i < subitemsCount; i++) {
-          start = addHex(reader.readHexNumber(dataSize), incHex(end));
-          end = addHex(reader.readHexNumber(dataSize), start);
+            start = addHex(this.readHexNumber(dataSize), incHex(end));
+            end = addHex(this.readHexNumber(dataSize), start);
             subitems.push({start: start, end: end});
           }
           break;
         case 1:
-        start = reader.readHex(dataSize);
-        end = addHex(reader.readHexNumber(dataSize), start);
-        code = reader.readNumber();
+          start = this.readHex(dataSize);
+          end = addHex(this.readHexNumber(dataSize), start);
+          code = this.readNumber();
           subitems.push({start: start, end: end, code: code});
           for (var i = 1; i < subitemsCount; i++) {
-          start = addHex(reader.readHexNumber(dataSize), incHex(end));
-          end = addHex(reader.readHexNumber(dataSize), start);
-          code = reader.readNumber();
+            start = addHex(this.readHexNumber(dataSize), incHex(end));
+            end = addHex(this.readHexNumber(dataSize), start);
+            code = this.readNumber();
             subitems.push({start: start, end: end, code: code});
           }
           break;
         case 2:
-        char = reader.readHex(dataSize);
-        code = reader.readNumber();
+          char = this.readHex(dataSize);
+          code = this.readNumber();
           subitems.push({char: char, code: code});
           for (var i = 1; i < subitemsCount; i++) {
-          char = sequence ? incHex(char) : addHex(reader.readHexNumber(dataSize), incHex(char));
-          code = reader.readSigned() + (code + 1);
+            char = sequence ? incHex(char) : addHex(this.readHexNumber(dataSize), incHex(char));
+            code = this.readSigned() + (code + 1);
             subitems.push({char: char, code: code});
           }
           break;
         case 3:
-        start = reader.readHex(dataSize);
-        end = addHex(reader.readHexNumber(dataSize), start);
-        code = reader.readNumber();
+          start = this.readHex(dataSize);
+          end = addHex(this.readHexNumber(dataSize), start);
+          code = this.readNumber();
           subitems.push({start: start, end: end, code: code});
           for (var i = 1; i < subitemsCount; i++) {
-          start = sequence ? incHex(end) : addHex(reader.readHexNumber(dataSize), incHex(end));
-          end = addHex(reader.readHexNumber(dataSize), start);
-          code = reader.readNumber();
+            start = sequence ? incHex(end) : addHex(this.readHexNumber(dataSize), incHex(end));
+            end = addHex(this.readHexNumber(dataSize), start);
+            code = this.readNumber();
             subitems.push({start: start, end: end, code: code});
           }
           break;
         case 4:
-        char = reader.readHex(ucs2DataSize);
-        code = reader.readHex(dataSize);
+          char = this.readHex(ucs2DataSize);
+          code = this.readHex(dataSize);
           subitems.push({char: char, code: code});
           for (var i = 1; i < subitemsCount; i++) {
-          char = sequence ? incHex(char) : addHex(reader.readHexNumber(ucs2DataSize), incHex(char));
-          code = addHex(reader.readHexSigned(dataSize), incHex(code));
+            char = sequence ? incHex(char) : addHex(this.readHexNumber(ucs2DataSize), incHex(char));
+            code = addHex(this.readHexSigned(dataSize), incHex(code));
             subitems.push({char: char, code: code});
           }
           break;
         case 5:
-        start = reader.readHex(ucs2DataSize);
-        end = addHex(reader.readHexNumber(ucs2DataSize), start);
-        code = reader.readHex(dataSize);
+          start = this.readHex(ucs2DataSize);
+          end = addHex(this.readHexNumber(ucs2DataSize), start);
+          code = this.readHex(dataSize);
           subitems.push({start: start, end: end, code: code});
           for (var i = 1; i < subitemsCount; i++) {
-          start = sequence ? incHex(end) : addHex(reader.readHexNumber(ucs2DataSize), incHex(end));
-          end = addHex(reader.readHexNumber(ucs2DataSize), start);
-          code = reader.readHex(dataSize);
+            start = sequence ? incHex(end) : addHex(this.readHexNumber(ucs2DataSize), incHex(end));
+            end = addHex(this.readHexNumber(ucs2DataSize), start);
+            code = this.readHex(dataSize);
             subitems.push({start: start, end: end, code: code});
           }
           break;
         default:
-        throw new Error('Unknown type: ' + type)
+          throw new Error('Unknown type: ' + type);
       }
       result.body.push(item);
     }
 
     return result;
   }
+};
 
 function toHexDigit(n) {
   return n.toString(16);
@@ -423,15 +427,41 @@ function incHex(a) {
   return s;
 }
 
+function padRight(s, length) {
+  s = '' + s;
+  return s + new Array(Math.max(0, length - s.length + 1)).join(' ');
+}
+
+function padLeft(s, length) {
+  s = '' + s;
+  return new Array(Math.max(0, length - s.length + 1)).join(' ') + s;
+}
+
 exports.compressCmaps = function (src, dest, verify) {
   var files = fs.readdirSync(src).filter(function (fn) {
     return fn.indexOf('.') < 0; // skipping files with the extension
   });
+
+  var sumPacked = 0;
+  var sumOrig = 0;
   files.forEach(function (fn) {
     var srcPath = path.join(src, fn);
     var destPath = path.join(dest, fn + '.bcmap');
     var stats = compressCmap(srcPath, destPath, verify);
-    console.log('Compressing ' + fn + ': ' + stats.orig + ' vs ' + stats.packed +
-      ' ' + (stats.packed / stats.orig * 100).toFixed(1) + '%');
+    sumOrig += stats.orig;
+    sumPacked += stats.packed;
+//    console.log(padRight(fn, 25) + 
+//                padLeft(stats.packed, 7) + '/' + padLeft(stats.orig, 7) + ',' +
+//                padLeft((stats.packed / stats.orig * 100).toFixed(1) + '%', 7));
   });
+  console.log('-------------------------------------------------------');
+  console.log(padRight('TOTAL', 25) +
+              padLeft(sumPacked, 7) + '/' + padLeft(sumOrig, 7) + ',' +
+              padLeft((sumPacked / sumOrig * 100).toFixed(1) + '%', 7));
 };
+
+exports.writeNumber = writeNumber;
+exports.writeString = writeString;
+exports.padLeft = padLeft;
+exports.padRight = padRight;
+exports.BinaryCMapReader = BinaryCMapReader;