Skip to content

Commit

Permalink
Differential cmap compression code
Browse files Browse the repository at this point in the history
  • Loading branch information
fkaelberer committed Aug 28, 2014
1 parent 4834f1c commit b45239f
Show file tree
Hide file tree
Showing 6 changed files with 1,043 additions and 58 deletions.
1 change: 1 addition & 0 deletions external/bcmaps_temp/savings.json

Large diffs are not rendered by default.

51 changes: 50 additions & 1 deletion external/cmapscompress/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Quick notes about binary CMap format (bcmap)

The format is designed to package some information from the CMap files located at external/cmap. Please notice for size optimization reasons, the original information blocks can be changed (split or joined) and items in the blocks can be swaped.
The format is designed to package some information from the CMap files located at external/cmap. Please notice for size optimization reasons, the original information blocks can be changed (split or joined) and items in the blocks can be swapped.

The data is stored in binary format in network byte order (big-endian).

Expand All @@ -15,6 +15,55 @@ The following primitives used during encoding of the file:
- signed fixed number (SB[n]) – similar to the SN, but it represents a signed number that is stored in B[n]
- string (S) – the string is encoded as a sequence of bytes. First comes the length in characters, encoded as UN; then come the UTF-16 characters, each encoded as UN.

# Differential compression

The contents of each CMap file are stored either normally or differentially. In the latter case, a second CMap file (the 'base file') is needed for file decoding.

The first record in each file indicates if the file is stored normally or differentially.
It is a *baseFileName* string (S) which
- is empty ('') if the file is stored normally, or
- contains the file name of the base file (without path or extension) if it is stored differentially.

The second record, *contentSize*, is an unsigned number (UN) which contains
- the number of bytes to follow if the file is stored normally ('the file contents'), or
- the number of bytes to reconstruct via differential compression (see below).

In either case, the (possibly decoded) file contents are then structured as described in the [file structure](#file-structure) section.

### Decoding differential data

If a CMap file (let's name it *A*) is stored differentially, file contents are to be constructed from the contents of *A* and from the base file (which we shall call *B*).
The records to follow are alternately of the following type, starting with *copy*.

A **copy**-type instruction is specified by
- startDelta as UN
- length as UN

which instructs to read *length* bytes from *B*, where startDelta specifies the start position as an offset from the previously used array end. (The previous array end is initialized with the position of the start of content, i.e., after *baseFileName* and *contentSize* in *B*).

An **insert**-type instruction is specified by
- length as UN

and instructs to read the following *length* bytes from *A* and append them to the contents.

It may happen that file *B* itself is stored differentially and depends on a further file. In this case, *B* has to be restored before restoring *A*. The following pseudocode accomplishes the decoding:
```
var contents = '';
var previousEnd = 0; // position after *baseFileName* and *contentSize* in baseFile
for (var copy = true; contents.length < contentSize; copy = !copy) {
if (copy) {
var start = previousEnd + A.readUN();
var length = A.readUN();
contents.append(B.subarray(start, start + length));
previousEnd = start + length;
} else {
var length = A.readUN();
contents.append(A.readBytes(length));
}
}
```

<a name="file-structure"></a>
# File structure

The first byte is a header:
Expand Down
128 changes: 79 additions & 49 deletions external/cmapscompress/compress.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
* limitations under the License.
*/

'use strict';

var fs = require('fs');
var path = require('path');
var parseAdobeCMap = require('./parse.js').parseAdobeCMap;
Expand Down Expand Up @@ -112,8 +114,8 @@ function compressCmap(srcPath, destPath, verify) {
fs.writeFileSync(destPath, new Buffer(out, 'hex'));

if (verify) {
var result2 = parseCMap(out);
var isGood = JSON.stringify(inputData) == JSON.stringify(result2);
var result2 = new BinaryCMapReader(out).parse();
var isGood = JSON.stringify(inputData) === JSON.stringify(result2);
if (!isGood) {
throw new Error('Extracted data does not match the expected result');
}
Expand All @@ -125,11 +127,13 @@ function compressCmap(srcPath, destPath, verify) {
};
}

function parseCMap(binaryData) {
var reader = {
buffer: binaryData,
pos: 0,
end: binaryData.length,
/**
 * Sequential reader over binary CMap data.
 *
 * compressCmap hands in the hex-encoded output string it also writes to
 * disk; the prototype methods consume it starting at `pos`.
 *
 * @param {string} binaryData - the data to read; only its `length` is
 *   inspected here.
 */
function BinaryCMapReader(binaryData) {
  this.buffer = binaryData;
  // Reading always starts at the beginning of the data.
  this.pos = 0;
  // Cached limit, so readByte can tell when the data is exhausted.
  this.end = this.buffer.length;
}

BinaryCMapReader.prototype = {
readByte: function () {
if (this.pos >= this.end) {
return -1;
Expand Down Expand Up @@ -164,7 +168,7 @@ function parseCMap(binaryData) {
var stack = [];
do {
var b = this.readByte();
last = !(b & 0x80);
var last = !(b & 0x80);
stack.push(b & 0x7F);
} while (!last);
var s = '', buffer = 0, bufferSize = 0;
Expand Down Expand Up @@ -198,10 +202,9 @@ function parseCMap(binaryData) {
s += String.fromCharCode(this.readNumber());
}
return s;
}
};

var header = reader.readByte();
},
parse: function() {
var header = this.readByte();
var result = {
type: header >> 1,
wmode: header & 1,
Expand All @@ -211,15 +214,15 @@ function parseCMap(binaryData) {
};

var b;
while ((b = reader.readByte()) >= 0) {
while ((b = this.readByte()) >= 0) {
var type = b >> 5;
if (type === 7) {
switch (b & 0x1F) {
case 0:
result.comment = reader.readString();
result.comment = this.readString();
break;
case 1:
result.usecmap = reader.readString();
result.usecmap = this.readString();
break;
}
continue;
Expand All @@ -235,83 +238,84 @@ function parseCMap(binaryData) {
item.sequence = true;
}
var ucs2DataSize = 1;
var subitemsCount = reader.readNumber();
var subitemsCount = this.readNumber();
var start, end, code, char;
switch (type) {
case 0:
start = reader.readHex(dataSize);
end = addHex(reader.readHexNumber(dataSize), start);
start = this.readHex(dataSize);
end = addHex(this.readHexNumber(dataSize), start);
subitems.push({start: start, end: end});
for (var i = 1; i < subitemsCount; i++) {
start = addHex(reader.readHexNumber(dataSize), incHex(end));
end = addHex(reader.readHexNumber(dataSize), start);
start = addHex(this.readHexNumber(dataSize), incHex(end));
end = addHex(this.readHexNumber(dataSize), start);
subitems.push({start: start, end: end});
}
break;
case 1:
start = reader.readHex(dataSize);
end = addHex(reader.readHexNumber(dataSize), start);
code = reader.readNumber();
start = this.readHex(dataSize);
end = addHex(this.readHexNumber(dataSize), start);
code = this.readNumber();
subitems.push({start: start, end: end, code: code});
for (var i = 1; i < subitemsCount; i++) {
start = addHex(reader.readHexNumber(dataSize), incHex(end));
end = addHex(reader.readHexNumber(dataSize), start);
code = reader.readNumber();
start = addHex(this.readHexNumber(dataSize), incHex(end));
end = addHex(this.readHexNumber(dataSize), start);
code = this.readNumber();
subitems.push({start: start, end: end, code: code});
}
break;
case 2:
char = reader.readHex(dataSize);
code = reader.readNumber();
char = this.readHex(dataSize);
code = this.readNumber();
subitems.push({char: char, code: code});
for (var i = 1; i < subitemsCount; i++) {
char = sequence ? incHex(char) : addHex(reader.readHexNumber(dataSize), incHex(char));
code = reader.readSigned() + (code + 1);
char = sequence ? incHex(char) : addHex(this.readHexNumber(dataSize), incHex(char));
code = this.readSigned() + (code + 1);
subitems.push({char: char, code: code});
}
break;
case 3:
start = reader.readHex(dataSize);
end = addHex(reader.readHexNumber(dataSize), start);
code = reader.readNumber();
start = this.readHex(dataSize);
end = addHex(this.readHexNumber(dataSize), start);
code = this.readNumber();
subitems.push({start: start, end: end, code: code});
for (var i = 1; i < subitemsCount; i++) {
start = sequence ? incHex(end) : addHex(reader.readHexNumber(dataSize), incHex(end));
end = addHex(reader.readHexNumber(dataSize), start);
code = reader.readNumber();
start = sequence ? incHex(end) : addHex(this.readHexNumber(dataSize), incHex(end));
end = addHex(this.readHexNumber(dataSize), start);
code = this.readNumber();
subitems.push({start: start, end: end, code: code});
}
break;
case 4:
char = reader.readHex(ucs2DataSize);
code = reader.readHex(dataSize);
char = this.readHex(ucs2DataSize);
code = this.readHex(dataSize);
subitems.push({char: char, code: code});
for (var i = 1; i < subitemsCount; i++) {
char = sequence ? incHex(char) : addHex(reader.readHexNumber(ucs2DataSize), incHex(char));
code = addHex(reader.readHexSigned(dataSize), incHex(code));
char = sequence ? incHex(char) : addHex(this.readHexNumber(ucs2DataSize), incHex(char));
code = addHex(this.readHexSigned(dataSize), incHex(code));
subitems.push({char: char, code: code});
}
break;
case 5:
start = reader.readHex(ucs2DataSize);
end = addHex(reader.readHexNumber(ucs2DataSize), start);
code = reader.readHex(dataSize);
start = this.readHex(ucs2DataSize);
end = addHex(this.readHexNumber(ucs2DataSize), start);
code = this.readHex(dataSize);
subitems.push({start: start, end: end, code: code});
for (var i = 1; i < subitemsCount; i++) {
start = sequence ? incHex(end) : addHex(reader.readHexNumber(ucs2DataSize), incHex(end));
end = addHex(reader.readHexNumber(ucs2DataSize), start);
code = reader.readHex(dataSize);
start = sequence ? incHex(end) : addHex(this.readHexNumber(ucs2DataSize), incHex(end));
end = addHex(this.readHexNumber(ucs2DataSize), start);
code = this.readHex(dataSize);
subitems.push({start: start, end: end, code: code});
}
break;
default:
throw new Error('Unknown type: ' + type)
throw new Error('Unknown type: ' + type);
}
result.body.push(item);
}

return result;
}
};

function toHexDigit(n) {
return n.toString(16);
Expand Down Expand Up @@ -423,15 +427,41 @@ function incHex(a) {
return s;
}

/**
 * Pads a value with trailing spaces up to a minimum width.
 *
 * @param {*} s - value to pad; converted to a string first.
 * @param {number} length - minimum width of the result.
 * @returns {string} the text followed by enough spaces to reach `length`;
 *   text that is already that wide is returned without padding.
 */
function padRight(s, length) {
  var text = '' + s;
  // Joining an array of n empty slots yields n - 1 separators, hence the + 1.
  var slots = Math.max(0, length - text.length + 1);
  var spaces = new Array(slots).join(' ');
  return text + spaces;
}

/**
 * Pads a value with leading spaces up to a minimum width.
 *
 * @param {*} s - value to pad; converted to a string first.
 * @param {number} length - minimum width of the result.
 * @returns {string} the text preceded by enough spaces to reach `length`;
 *   text that is already that wide is returned without padding.
 */
function padLeft(s, length) {
  var text = '' + s;
  // Joining an array of n empty slots yields n - 1 separators, hence the + 1.
  var slots = Math.max(0, length - text.length + 1);
  var spaces = new Array(slots).join(' ');
  return spaces + text;
}

exports.compressCmaps = function (src, dest, verify) {
var files = fs.readdirSync(src).filter(function (fn) {
return fn.indexOf('.') < 0; // skipping files with the extension
});

var sumPacked = 0;
var sumOrig = 0;
files.forEach(function (fn) {
var srcPath = path.join(src, fn);
var destPath = path.join(dest, fn + '.bcmap');
var stats = compressCmap(srcPath, destPath, verify);
console.log('Compressing ' + fn + ': ' + stats.orig + ' vs ' + stats.packed +
' ' + (stats.packed / stats.orig * 100).toFixed(1) + '%');
sumOrig += stats.orig;
sumPacked += stats.packed;
// console.log(padRight(fn, 25) +
// padLeft(stats.packed, 7) + '/' + padLeft(stats.orig, 7) + ',' +
// padLeft((stats.packed / stats.orig * 100).toFixed(1) + '%', 7));
});
console.log('-------------------------------------------------------');
console.log(padRight('TOTAL', 25) +
padLeft(sumPacked, 7) + '/' + padLeft(sumOrig, 7) + ',' +
padLeft((sumPacked / sumOrig * 100).toFixed(1) + '%', 7));
};

// Additional names exposed on the module's exports object, alongside
// compressCmaps.
exports.writeNumber = writeNumber;
exports.writeString = writeString;
exports.padLeft = padLeft;
exports.padRight = padRight;
exports.BinaryCMapReader = BinaryCMapReader;
Loading

0 comments on commit b45239f

Please sign in to comment.