dict-generate.cpp

/**********************************************************************************
 * Program to generate the dictionary for the C implementation of the zxcvbn password estimator.
 * Copyright (c) 2015-2017 Tony Evans
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 **********************************************************************************/

#include <algorithm>
#include <iostream>
#include <string>
#include <fstream>
#include <list>
#include <set>
#include <vector>
#include <map>
#include <memory>
#include <limits>
#include <stdlib.h>
#include <string.h>
#include <math.h>

using namespace std;

class Node;
typedef std::shared_ptr<Node> NodeSPtr;
typedef std::weak_ptr<Node> NodeWPtr;
typedef std::map<char, NodeSPtr> NodeMap_t;

typedef unsigned int Check_t;

/**********************************************************************************
 * Class to perform CRC checksum calculation.
 */
class TrieCheck
{
public:
    typedef uint64_t Check_t;
    static const Check_t CHK_INIT = 0xffffffffffffffff;
    TrieCheck()          { Init(); }
    void Init()          { mCrc = CHK_INIT; }
    operator Check_t() const { return Result(); }
    Check_t  Result()  const { return mCrc; }
    bool operator ! () const { return mCrc == CHK_INIT; }
    void operator () (const void *, unsigned int);
protected:
    Check_t mCrc;
};

/**********************************************************************************
 * Class to hold a node within the trie
 */
class Node
{
public:
    Node();
    Node(const Node &);
    ~Node();
    Node & operator = (const Node &);
    //bool operator == (const Node & r) const { return !IsEqual(r); }
    //bool operator != (const Node & r) const { return !IsEqual(r); }
    void          SetEnd()           { mEnd = true; }
    bool          IsEnd() const      { return mEnd; }
    int           Height() const     { return mHeight; }

    // Scan the trie and count nodes
    int           NodeCount()        { ClearCounted() ; return CountNodes(); }


    int           CalcAddress()      { int a=0; ClearCounted(); a=CalcAddr(a, true); return CalcAddr(a, false); }
    Node         *GetParent()        { return mParent; }
    unsigned int  GetAddr() const    { return mAddr; }
    NodeMap_t::iterator ChildBegin() { return mChild.begin(); }
    NodeMap_t::iterator ChildEnd()   { return mChild.end(); }
    unsigned int  GetNumChild()      { return mChild.size(); }
    int           GetNumEnds() const { return mEndings; }
    NodeSPtr      FindChild(char);
    std::string   GetChildChars();

    TrieCheck::Check_t CalcCheck();
    int         CalcEndings();
    int         CalcHeight();
    NodeSPtr    AddChild(char);
    void        ChangeChild(NodeSPtr &, NodeSPtr &);
//    bool        IsEqual(const Node &) const;
    void        ClearCounted();
    void        SetCounted()    { mCounted = true; }
    bool        IsCounted() const { return mCounted; }
protected:
    int      CountNodes();
    int      CalcAddr(int, bool);

    NodeMap_t    mChild;
    Node        *mParent;
    int          mEndings;
    int          mHeight;
    unsigned int mAddr;
    TrieCheck    mCheck;
    bool         mEnd;
    bool         mCounted;
};

/**********************************************************************************
 * Static table used for the crc implementation.
 */
static const TrieCheck::Check_t CrcTable[16] =
{
    0x0000000000000000, 0x7d08ff3b88be6f81, 0xfa11fe77117cdf02, 0x8719014c99c2b083,
    0xdf7adabd7a6e2d6f, 0xa2722586f2d042ee, 0x256b24ca6b12f26d, 0x5863dbf1e3ac9dec,
    0x95ac9329ac4bc9b5, 0xe8a46c1224f5a634, 0x6fbd6d5ebd3716b7, 0x12b5926535897936,
    0x4ad64994d625e4da, 0x37deb6af5e9b8b5b, 0xb0c7b7e3c7593bd8, 0xcdcf48d84fe75459
};

// Update the crc value with new data.
void TrieCheck::operator () (const void *v, unsigned int Len)
{
    Check_t Crc = mCrc;
    const unsigned char *Data = reinterpret_cast<const unsigned char *>(v);
    while(Len--)
    {
        Crc = CrcTable[(Crc ^ (*Data >> 0)) & 0x0f] ^ (Crc >> 4);
        Crc = CrcTable[(Crc ^ (*Data >> 4)) & 0x0f] ^ (Crc >> 4);
        ++Data;
    }
    mCrc = Crc;
}

Node::Node()
{
    mEndings = -1;
    mHeight = -1;
    mEnd    = false;
    mParent = 0;
}

Node::Node(const Node &r)
{
    *this = r;
}

Node::~Node()
{

}

Node &Node::operator = (const Node & r)
{
    mChild   = r.mChild;
    mParent  = r.mParent;
    mEndings = r.mEndings;
    mHeight  = r.mHeight;
    mCheck   = r.mCheck;
    mEnd     = r.mEnd;
    return *this;
}


/**********************************************************************************
 * Generate a checksum for the current node. Value also depends of the
 * checksum of any child nodes
 */
TrieCheck::Check_t Node::CalcCheck()
{
    if (!mCheck)
    {
        // Not done this node before
        char c;
        NodeMap_t::iterator It;
        mCheck.Init();
        // Include number of children
        c = mChild.size();
        mCheck(&c, sizeof c);
        // For each child include its character and node checksum
        for(It = mChild.begin(); It != mChild.end(); ++It)
        {
            Check_t n = It->second->CalcCheck();
            c = It->first;
            mCheck(&c, sizeof c);
            mCheck(&n, sizeof n);
        }
        // Finally include whether this node is an ending in the chaecksum
        c = mEnd;
        mCheck(&c, sizeof c);
    }
    return mCheck;
}

/**********************************************************************************
 * Get number of nodes for this which end/finish a word
 */
int Node::CalcEndings()
{
    if (mEndings < 0)
    {
        // Not already done this node,so calculate the ends
        int n = 0;
        NodeMap_t::iterator It;
        // Number of endings is sum of the endings of the child nodes and plus this node if it ends a word
        for(It = mChild.begin(); It != mChild.end(); ++It)
            n += It->second->CalcEndings();
        n += !!mEnd;
        mEndings = n;
    }
    return mEndings;
}

/**********************************************************************************
 * Calculate the height of the trie starting at current node
 */
int Node::CalcHeight()
{
    if (mHeight < 0)
    {
        // Not already done this node,so calculate the height
        int Hi = 0;
        NodeMap_t::iterator It;
        // Get height of all child nodes, remember the highest
        for(It = mChild.begin(); It != mChild.end(); ++It)
        {
            int i = It->second->CalcHeight();
            if (i >= Hi)
                Hi = i+1;
        }
        mHeight = Hi;
    }
    return mHeight;
}

/**********************************************************************************
 * Clear indication that node has been counted
 */
void Node::ClearCounted()
{
    NodeMap_t::iterator It;
    mCounted = false;
    for(It = mChild.begin(); It != mChild.end(); ++It)
        It->second->ClearCounted();
}

/**********************************************************************************
 * Count this plus the number of child nodes. As part of the tree node count
 * scan, make sure not to double count nodes
 */
int Node::CountNodes()
{
    // Count is 0 if already done
    if (mCounted)
        return 0;
    mCounted = true;
    NodeMap_t::iterator It;
    int i = 1; // 1 for this node

    // Add the child nodes
    for(It = mChild.begin(); It != mChild.end(); ++It)
        i += It->second->CountNodes();
    return i;
}

/**********************************************************************************
 * Calculate the final node address
 */
int Node::CalcAddr(int Start, bool ManyEnds)
{
    NodeMap_t::iterator It;

    if (!(mCounted || (ManyEnds && (mEndings < 256))))
    {
        mCounted = true;
        mAddr = Start++;
    }
    for(It = mChild.begin(); It != mChild.end(); ++It)
        Start = It->second->CalcAddr(Start, ManyEnds);

    return Start;
}

/**********************************************************************************
 * Add the given character to the current node, return the next lower node
 */
NodeSPtr Node::AddChild(char c)
{
    NodeMap_t::iterator It;
    // Find character in map of child nodes
    It = mChild.find(c);
    if (It == mChild.end())
    {
        // New character, create new child node
        NodeSPtr a(new Node);
        a->mParent = this;
        std::pair<char, NodeSPtr> x(c, a);
        std::pair<NodeMap_t::iterator, bool> y = mChild.insert(x);
        It = y.first;
    }
    return It->second;
}

/**********************************************************************************
 * Find the child node which corresponds to the given character.
 */
NodeSPtr Node::FindChild(char Ch)
{
    NodeMap_t::iterator It;
    It = mChild.find(Ch);
    if (It == mChild.end())
        return NodeSPtr();
    return It->second;
}

/**********************************************************************************
 * Replace the current child node (old param) with a new child (Replace param),
 * and update the new child parent.
 */
void Node::ChangeChild(NodeSPtr & Replace, NodeSPtr & Old)
{
    NodeMap_t::iterator It;
    for(It = mChild.begin(); It != mChild.end(); ++It)
    {
        NodeSPtr p = It->second;
        if (p == Old)
        {
            It->second = Replace;
            Replace->mParent = this;
            break;
        }
    }
}

/**********************************************************************************
 * Find all the characters corresponding to the children of this node.
 */
std::string Node::GetChildChars()
{
    NodeMap_t::iterator It;
    std::string Result;
    for(It = mChild.begin(); It != mChild.end(); ++It)
    {
        char c = It->first;
        Result += c;
    }
    return Result;
}

/**********************************************************************************
 * struct to hold data read from input file (except for the word string)
 */
struct Entry
{
    Entry() : mRank(0), mDict(0), mOrder(0), mOccurs(0) {}
    int mRank;
    int mDict;
    int mOrder;
    int mOccurs;
};

/**********************************************************************************
 * Struct to hold a string and an int. Also provide the compare operators for std::set class
 */
struct StringInt
{
    string       s;
    unsigned int i;
    StringInt() { i=0; }
    StringInt(const StringInt & r) : s(r.s), i(r.i) {}
    StringInt & operator = (const StringInt & r) { i = r.i; s = r.s; return *this; }
    bool operator < (const StringInt & r)  const { return s < r.s; }
    bool operator > (const StringInt & r)  const { return s > r.s; }
    bool operator == (const StringInt & r) const { return s == r.s; }
    StringInt * Self() const { return const_cast<StringInt *>(this); }
};

typedef map<string, Entry> EntryMap_t;
typedef list<string> StringList_t;
typedef list<NodeSPtr> NodeList_t;
typedef set<StringInt> StringIntSet_t;
typedef vector<int> StringOfInts;
typedef vector<unsigned int> UintVect;
typedef vector<uint64_t> Uint64Vect;
typedef vector<StringInt *> StrIntPtrVect_t;
typedef vector<StringInt> StringIntVect_t;

// Variables holding 'interesting' information on the data
unsigned int MaxLength, MinLength, NumChars, NumInWords, NumDuplicate;
static string PassWithMaxChilds, MaxChildChars;
static unsigned int MaxNumChilds, MaxChildsPosn;

struct FileInfo
{
    FileInfo() : Words(0), BruteIgnore(0), Accented(0), Dups(0), Used(0), Rank(0) { }
    string Name;
    StringList_t Pwds;
    int Words;
    int BruteIgnore;
    int Accented;
    int Dups;
    int Used;
    int Rank;
};

/**********************************************************************************
 * Read the file of words and add them to the file information.
 */
static bool ReadInputFile(const string & FileName, FileInfo &Info, int MaxRank)
{
    ifstream f(FileName.c_str());
    if (!f.is_open())
    {
        cerr << "Error opening " << FileName << endl;
        return false;
    }
    Info.Name = FileName;

    // Rank is the position of the work in the dictionary file. Rank==1 is lowest for a word (and
    // indicates a very popular or bad password).
    int Rank = 0;
    string Line;
    while(getline(f, Line) && (Rank < MaxRank))
    {
        // Truncate at first space or tab to leave just the word in case additional info on line
        string::size_type y = Line.find_first_of("\t ");
        if (y != string::npos)
            Line.erase(y);

        y = Line.length();
        if (!y)
            continue;

        ++Info.Words;
        // Only use words where all chars are ascii (no accents etc.)
        string::size_type x;
        double BruteForce = 1.0;
        for(x = 0; x < y; ++x)
        {
            unsigned char c = Line[x];
            if (c >= 128)
                break;
            c = tolower(c);
            Line[x] = c;
            BruteForce *= 26.0;
        }
        if (x < y)
        {
            ++Info.Accented;
            continue;
        }

        // Don't use words where the brute force strength is less than the word's rank
        if (BruteForce < (Rank+1))
        {
            ++Info.BruteIgnore;
            continue;
        }
        // Remember some interesting info
        if (y > MaxLength)
            MaxLength = y;
        if (y < MinLength)
            MinLength = y;
        NumChars += y;

        Info.Pwds.push_back(Line);
        ++Rank;
    }
    f.close();
    return true;
}

static void CombineWordLists(EntryMap_t & Entries, FileInfo *Infos, int NumInfo)
{
    bool Done = false;
    int Rank = 0;
    while(!Done)
    {
        int i;
        ++Rank;
        Done = true;
        for(i = 0; i < NumInfo; ++i)
        {
            FileInfo *p = Infos + i;
            while(!p->Pwds.empty())
            {
                Done = false;
                string Word = p->Pwds.front();
                p->Pwds.pop_front();
                EntryMap_t::iterator It = Entries.find(Word);
                if (It != Entries.end())
                {
                    // Word is repeat of one from another file
                    p->Dups += 1;
                    ++NumDuplicate;
                }
                else
                {
                    // New word, add it
                    Entry e;
                    e.mDict = i;
                    e.mRank = Rank;
                    Entries.insert(std::pair<std::string, Entry>(Word, e));
                    p->Used += 1;
                    break;
                }
            }
        }
    }
}

/**********************************************************************************
 * Use all words previously read from file(s) and add them to a Trie, which starts
 * at Root. Also update a bool array indicating the chars used in the words.
 */
static void ProcessEntries(NodeSPtr Root, EntryMap_t & Entries, bool *InputCharSet)
{
    EntryMap_t::iterator It;
    std::string Text;
    for(It = Entries.begin(); It != Entries.end(); ++It)
    {
        Text = It->first;
        // Add latest word to tree
        string::size_type x;
        NodeSPtr pNode = Root;
        for(x = 0; x < Text.length(); ++x)
        {
            char c = Text[x];
            pNode = pNode->AddChild(c);
            // Add char to set of used character codes
            InputCharSet[c & 0xFF] = true;
        }
        pNode->SetEnd();
    }
}

/**********************************************************************************
 * Add the passed node to the list if it has same height  as value in Hi (= number
 * of steps to get to a terminal node). If current node has height greater than Hi,
 * recursivly call with each child node as one of these may be at the required height.
 */
static void AddToListAtHeight(NodeList_t & Lst, NodeSPtr Node, int Hi)
{
    if (Hi == Node->Height())
    {
        Lst.push_back(Node);
        return;
    }
    if (Hi < Node->Height())
    {
        NodeMap_t::iterator It;
        for(It = Node->ChildBegin(); It != Node->ChildEnd(); ++It)
        {
            AddToListAtHeight(Lst, It->second, Hi);
        }
    }
}

/**********************************************************************************
 * Scan the trie and update the original word list with the alphabetical order
 * (or 'index location') of the words
 */
static void ScanTrieForOrder(EntryMap_t & Entries, int & Ord, NodeSPtr Root, const string & Str)
{
    if (Root->IsEnd())
    {
        // Root is a word ending node, so store its index in the input word store
        EntryMap_t::iterator Ite;
        Ite = Entries.find(Str);
        if (Ite == Entries.end())
            throw "Trie string not in entries";

        Ite->second.mOrder = ++Ord;
    }
    NodeMap_t::iterator It;
    string Tmp;
    // For each child, append its character to the current word string and do a recursive
    // call to update their word indexes.
    for(It = Root->ChildBegin(); It != Root->ChildEnd(); ++It)
    {
        Tmp = Str + It->first;
        ScanTrieForOrder(Entries, Ord, It->second, Tmp);
    }
}

/**********************************************************************************
 * Reduce the trie by merging tails where possible. Starting at greatest height,
 * get a list of all nodes with given height, then test for identical nodes. If
 * found, change the parent of the second identical node to use the first node,
 * and delete second node and its children. Reduce height by one and repeat
 * until height is zero.
 */
static void ReduceTrie(NodeSPtr Root)
{
    int Height;
    Root->CalcCheck();

    NodeSPtr pNode = Root;
    for(Height = Root->CalcHeight(); Height >= 0; --Height)
    {
        // Get a list of all nodes at given height
        NodeList_t Lst;
        AddToListAtHeight(Lst, Root, Height);

        NodeList_t::iterator Ita, Itb;
        for(Ita = Lst.begin(); Ita != Lst.end(); ++Ita)
        {
            // Going to use a CRC to decide if two nodes are identical
            TrieCheck::Check_t Chka = (*Ita)->CalcCheck();
            Itb = Ita;
            for(++Itb; Itb != Lst.end(); )
            {
                if (Chka == (*Itb)->CalcCheck())
                {
                    // Found two identical nodes (with identical children)
                    Node * Parentb = (*Itb)->GetParent();
                    if (Parentb)
                    {
                        // Change the 2nd parent to use the current node as child
                        // Remove the 2nd node from the scanning list to as it will
                        // get deleted by the sharing (as using std::shared_ptr)
                        Parentb->ChangeChild(*Ita, *Itb);
                        Itb = Lst.erase(Itb);
                    }
                    else
                    {
                        cout << " orphan ";
                        ++Itb;
                    }
                }
                else
                {
                    ++Itb;
                }
            }
        }
    }
}

/**********************************************************************************
 * Scan the trie to match with the supplied word. Return the order of the
 * word, or -1 if it is not in the trie.
 */
static int CheckWord(NodeSPtr Root, const string & Str)
{
    int i = 1;
    bool e = false;
    string::size_type x;
    NodeSPtr p = Root;

    for(x = 0; x < Str.length(); ++x)
    {
        int j;
        NodeMap_t::iterator It;
        // Scan children to find one that matches current character
        char c = Str[x];
        for(It = p->ChildBegin(); It != p->ChildEnd(); ++It)
        {
            if (It->first == c)
                break;
            // Add the number of endings at or below child to track the alphabetical index
            j = It->second->CalcEndings();
            i += j;
        }
        // Fail if no child matches the character
        if (It == p->ChildEnd())
            return -1;
        // Allow for this node being a word ending
        e = p->IsEnd();
        if (e)
            ++i;

        if (p->GetNumChild() > MaxNumChilds)
        {
            NodeMap_t::iterator Itc;
            MaxNumChilds = p->GetNumChild();
            MaxChildsPosn = x;
            PassWithMaxChilds = Str;
            MaxChildChars.clear();
            for(Itc = p->ChildBegin(); Itc != p->ChildEnd(); ++Itc)
                MaxChildChars += Itc->first;
        }
        p = It->second;
    }

    if (p && p->IsEnd())
    {
        if (x == Str.length())
            return i;
    }
    return -1;
}

/**********************************************************************************
 * Try to find every input word in the reduced trie. The order should also
 * match, otherwise the reduction has corrupted the trie.
 */
static int CheckReduction(StringIntVect_t & Ranks, NodeSPtr Root, EntryMap_t & Entries)
{
    int i = 0;
    int n = 0;
    EntryMap_t::iterator It;
    std::string Text;
    int b;
    Ranks.resize(Entries.size()+1);
    for(It = Entries.begin(); (It != Entries.end()) && (i <= 200000); ++It)
    {
        Text = It->first;
        b = CheckWord(Root, Text);
        if (b < 0)
        {
            ++i;
            cout << It->second.mOrder << ": Missing " << Text.c_str() << endl;
        }
        else if (It->second.mOrder != b)
        {
            ++i;
            cout << It->second.mOrder << ": Bad order " << b << " for  " << Text.c_str() << endl;
        }
        else
        {
            //if (Text == "fred")
            //    cout << Text.c_str() << "-> " << It->second.mOrder << ", " << It->second.mRank << endl;
            ++n;
        }
        if (b >= int(Ranks.size()))
            throw " Using  Ranks beyond end";
        if (b >= 0)
        {
            char Tmp[20];
            Ranks[b].i = It->second.mRank;
            snprintf(Tmp, sizeof(Tmp), "%d: ", n);
            Ranks[b].s = string(Tmp) + Text;
        }
        // Try to find a non-existant word
        Text.insert(0, "a");
        Text += '#';
        b = CheckWord(Root, Text);
        if (b > 0)
            throw string("Found non-existant word ") + Text;
     }
     if (i > 0)
         throw "Missing words in reduction check = " + to_string(i);
    return n;
}

struct ChkNum
{
    int Match;
    int Err;
    ChkNum() : Match(0), Err(0) {}
    ChkNum(const ChkNum &r) : Match(r.Match), Err(r.Err) {}
    ChkNum & operator = (const ChkNum & r) { Match = r.Match; Err = r.Err; return *this; }
    ChkNum & operator += (const ChkNum & r) { Match += r.Match; Err += r.Err; return *this; }
};

/**********************************************************************************
 * Find all possible words in the trie and make sure they are input words.
 * Return number of words found. Done as a second trie check.
 */
static ChkNum CheckEntries(NodeSPtr Root, string Str, const EntryMap_t & Entries)
{
    ChkNum Ret;
    if (Root->IsEnd())
    {
        // This is an end node, find the word in the input words
        EntryMap_t::const_iterator It = Entries.find(Str);
        if (It != Entries.end())
            ++Ret.Match;
        else
            ++Ret.Err;
    }
    // Add each child character to the passed string and recursively check
    NodeMap_t::iterator It;
    for(It = Root->ChildBegin(); It != Root->ChildEnd(); ++It)
    {
        string Tmp = Str;
        Tmp += It->first;
        Ret += CheckEntries(It->second, Tmp, Entries);
    }
    return Ret;
}

/**********************************************************************************
 * Convert the passed bool array of used chars into a character string
 */
string MakeCharSet(bool *InputCharSet)
{
    int i;
    string s;
    for(i = 1; i < 256; ++i)
    {
        if (InputCharSet[i])
            s += char(i);
    }
    return s;
}

/**********************************************************************************
 * Create a set of strings which contain the possible characters matched at
 * a node when checking a word.
 */
void MakeChildBitMap(StringIntSet_t & StrSet, NodeSPtr Root, int & Loc)
{
    // Skip if already done
    if (Root->IsCounted())
        return;

    string::size_type x;
    StringInt In;
    NodeSPtr p = Root;
    In.s = Root->GetChildChars();
    if (StrSet.find(In) == StrSet.end())
    {
        // Not already in set of possible child chars for a node, so add it
        In.i = Loc++; // Address in the final output array
        StrSet.insert(In);
    }
    // Recursively do the child nodes
    for(x = 0; x < In.s.length(); ++x)
    {
        char c = In.s[x];
        NodeSPtr q = p->FindChild(c);
        if (q)
            MakeChildBitMap(StrSet, q, Loc);
    }
    Root->SetCounted();
}

// Constants defining bit positions of node data
// Number of bits to represent the index of the child char pattern in the final child bitmap array,
const int BITS_CHILD_PATT_INDEX = 14;

// Number of bits to represent index of where the child pointers start for this node in
// the Child map array and its bit position
const int BITS_CHILD_MAP_INDEX = 18;
const int SHIFT_CHILD_MAP_INDEX = BITS_CHILD_PATT_INDEX;

// Bit positions of word ending indicator and indicator for number of word endings for this + child nodes is >= 256
const int SHIFT_WORD_ENDING_BIT = SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX;
const int SHIFT_LARGE_ENDING_BIT = SHIFT_WORD_ENDING_BIT + 1;
/**********************************************************************************
 * Create the arrays of data that will be output
 */
void CreateArrays(NodeSPtr Root, StringIntSet_t & StrSet, StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds)
{
    NodeMap_t::iterator Itc;
    StringInt Tmp;
    StringOfInts Chld;

    // Find children in the child pattern array
    Tmp.s= Root->GetChildChars();
    StringIntSet_t::iterator Its = StrSet.find(Tmp);

    // Make a 'string' of pointers to the children
    for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
    {
        int i = Itc->second->GetAddr();
        Chld.push_back(i);
    }
    // Find where in pointer array the child pointer string is
    StringOfInts::size_type x = search(ChildAddrs.begin(), ChildAddrs.end(), Chld.begin(), Chld.end()) - ChildAddrs.begin();
    if (x == ChildAddrs.size())
    {
        // Not found, add it
        ChildAddrs.insert(ChildAddrs.end(), Chld.begin(), Chld.end());
    }
    // Val will contain the final node data
    uint64_t Val = Its->i;
    if (Val >= (1 << BITS_CHILD_PATT_INDEX))
    {
        char Tmp[20];
        snprintf(Tmp, sizeof Tmp, "%u", Its->i);
        throw string("Not enough bits for child pattern index value of ") + Tmp + " for " +
                Its->s + " (BITS_CHILD_PATT_INDEX too small)";
    }
    if (x >= (1 << BITS_CHILD_MAP_INDEX))
    {
        char Tmp[24];
        snprintf(Tmp, sizeof Tmp, "%lu", x);
        throw string("Not enough bits for child map index value of ") + Tmp + " for " +
                Its->s + " (BITS_CHILD_MAP_INDEX too small)";
    }
    Val |= x << SHIFT_CHILD_MAP_INDEX;
    if (Root->IsEnd())
        Val |= uint64_t(1) << SHIFT_WORD_ENDING_BIT;
    if (Root->GetNumEnds() >= 256)
        Val |= uint64_t(1) << SHIFT_LARGE_ENDING_BIT;

    // Make sure output arrays are big enough
    if (Root->GetAddr() > NodeData.size())
    {
        NodeData.resize(Root->GetAddr()+1, 4000000000);
        NodeEnds.resize(Root->GetAddr()+1, 4000000000);
    }
    // Save the node data and number of word endings for the node
    NodeData[Root->GetAddr()] = Val;
    NodeEnds[Root->GetAddr()] = Root->GetNumEnds();

    // Now do the children
    for(Itc = Root->ChildBegin(); Itc != Root->ChildEnd(); ++Itc)
    {
        CreateArrays(Itc->second, StrSet, ChildAddrs, NodeData, NodeEnds);
    }
}

/**********************************************************************************
 * Output the data as a binary file.
 */
static int OutputBinary(ostream *Out, const string & ChkFile, const string & CharSet, StringIntSet_t & StrSet, //NodeSPtr & Root,
                        StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
    int OutputSize;
    unsigned int FewEndStart = 2000000000;
    unsigned int i;
    unsigned int Index;
    unsigned short u;
    TrieCheck h;

    for(Index = 0; Index < NodeData.size(); ++Index)
    {
        uint64_t v = NodeData[Index];
        if ((FewEndStart >= 2000000000) && !(v & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
        {
            FewEndStart = Index;
            break;
        }
    }
    // Header words
    unsigned int NumWordEnd;
    const unsigned int MAGIC = 'z' + ('x'<< 8) + ('c' << 16) + ('v' << 24);
    Out->write((char *)&MAGIC, sizeof MAGIC);   // Write magic
    h(&MAGIC, sizeof MAGIC);
    OutputSize = sizeof MAGIC;

    i = NodeData.size();
    Out->write((char *)&i, sizeof i);   // Write number of nodes
    h(&i, sizeof i);
    OutputSize += sizeof i;

    i = ChildAddrs.size();
    if (NodeData.size() > numeric_limits<unsigned int>::max())
        i |= 1<<31;
    Out->write((char *)&i, sizeof i);   // Write number of child location entries & size of each entry
    h(&i, sizeof i);
    OutputSize += sizeof i;

    i = Ranks.size();
    Out->write((char *)&i, sizeof i);   // Write number of ranks
    h(&i, sizeof i);
    OutputSize += sizeof i;

    NumWordEnd = (NodeData.size() + 7) / 8;
    Out->write((char *)&NumWordEnd, sizeof NumWordEnd);   // Write number of word endings
    h(&NumWordEnd, sizeof NumWordEnd);
    OutputSize += sizeof NumWordEnd;

    i = StrSet.size();
    Out->write((char *)&i, sizeof i);   // Write size of of child bitmap data
    h(&i, sizeof i);
    OutputSize += sizeof i;

    unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
    Out->write((char *)&BytePerEntry, sizeof BytePerEntry);   // Write size of each child bitmap
    h(&BytePerEntry, sizeof BytePerEntry);
    OutputSize += sizeof BytePerEntry;

    Out->write((char *)&FewEndStart, sizeof FewEndStart);   // Write number of large end counts
    h(&FewEndStart, sizeof FewEndStart);
    OutputSize += sizeof FewEndStart;

    i = NodeData.size();
    Out->write((char *)&i, sizeof i);   // Write number of end counts
    h(&i, sizeof i);
    OutputSize += sizeof i;

    i = CharSet.length();
    Out->write((char *)&i, sizeof i);   // Write size of character set
    h(&i, sizeof i);
    OutputSize += sizeof i;

    // Output array of node data
    unsigned char *WordEnds = new unsigned char[NumWordEnd];
    unsigned char v = 0;
    unsigned int z = 0;
    int y = 0;
    for(Index = 0; Index < NodeData.size(); ++Index)
    {
        i = NodeData[Index];
        Out->write((char *)&i, sizeof i);
        h(&i, sizeof i);

        if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
            v |= 1 << y;
        if (++y >= 8)
        {
            WordEnds[z++] = v;
            y = 0;
            v = 0;
        }
    }
    while(z < NumWordEnd)
    {
         WordEnds[z++] = v;
         v = 0;
    }
    OutputSize += Index * sizeof i;

    // Output array of node pointers
    for(Index = 0; Index < ChildAddrs.size(); ++Index)
    {
        i = ChildAddrs[Index];
        Out->write((char *)&i, sizeof i);
        h(&i, sizeof i);
    }
    OutputSize += Index * sizeof i;

    // Output ranks
    for(Index = 0; Index < Ranks.size(); ++Index)
    {
        i = Ranks[Index].i;
        if (i >= (1 << 15))
        {
            i -= 1 << 15;
            i /= 4;
            if (i >= (1 << 15))
                i = (1 << 15) - 1;
            i |= 1 << 15;
        }
        if (i > numeric_limits<unsigned short>::max())
            i = numeric_limits<unsigned short>::max();
        u = i;
        Out->write((char *)&u, sizeof u);
        h(&u, sizeof u);
    }
    OutputSize += Index * sizeof u;

    // Output word end bit markers
    Out->write((char *)WordEnds, NumWordEnd);
    h(WordEnds, NumWordEnd);
    OutputSize += NumWordEnd;
    delete [] WordEnds;

    StringIntSet_t::iterator Its;
    string Str;
    unsigned char Buf[8];

    // Get the items from StrSet ordered by the index
    StrIntPtrVect_t SetPtrs;
    SetPtrs.resize(StrSet.size());
    for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
    {
        StringInt *p = Its->Self();
        if (p->i >= StrSet.size())
            throw "Bad index";
        SetPtrs[p->i] = p;
    }
    // Output child bitmap
    unsigned int CharSetLen = 0;
    for(Index = 0; Index < SetPtrs.size(); ++Index)
    {
        unsigned int i, j;
        string::size_type z, y;
        StringInt *p;
        memset(Buf, 0, sizeof Buf);
        p = SetPtrs[Index];
        Str = p->s;
        for(z = 0; z < Str.length(); ++z)
        {
            y = CharSet.find(Str[z]);
            if (y != string::npos)
            {
                Buf[y/8] |= 1 << (y & 7);
            }
        }
        // Find max bits set which indicates max number chars ued at a node
        for(i = j = 0; i < 8 * sizeof Buf; ++i)
        {
            if (Buf[i/8] & (1 << (i & 7)))
                ++j;
        }
        if (j > CharSetLen)
            CharSetLen = j;

        Out->write((char *)Buf, BytePerEntry);
        h(Buf, BytePerEntry);
     }
    OutputSize += Index * BytePerEntry;

    unsigned char c;
    for(Index = 0; Index < FewEndStart; ++Index)
    {
        i = NodeEnds[Index] >> 8;
        if (i >= 256)
            c = 0;
        else
            c = i;
        Out->write((char *)&c, 1);
        h(&c, 1);
    }
    OutputSize += Index * sizeof c;

    for(Index = 0; Index < NodeEnds.size(); ++Index)
    {
        c = NodeEnds[Index];
        Out->write((char *)&c, 1);
        h(&c, 1);
    }
    OutputSize += Index * sizeof c;

    Out->write(CharSet.c_str(), CharSet.length());
    h(CharSet.c_str(), CharSet.length());
    OutputSize += CharSet.length();

    if (!ChkFile.empty())
    {
        // Write the checksum file
        TrieCheck::Check_t x = h.Result();
        ofstream f(ChkFile);
        f << "static const unsigned char WordCheck[" << sizeof x << "] =\n{\n    ";
        unsigned char *c = reinterpret_cast<unsigned char *>(&x);
        for(Index = 0; Index < sizeof x; ++Index, ++c)
        {
            if (Index)
                f << ',';
            f << int(*c);
        }
        f << "\n};\n";
        f << "#define WORD_FILE_SIZE " << OutputSize << endl;
        f << "#define ROOT_NODE_LOC 0\n"
             "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
             "#define BITS_CHILD_MAP_INDEX  " << BITS_CHILD_MAP_INDEX << "\n" 
             "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
             "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)\n"
             "#define CHARSET_SIZE " << (CharSetLen + 1) << endl;
        f.close();
    }
    return OutputSize;
}

int OutputTester(ostream *Out, bool /*Cmnts*/, StringIntVect_t & Ranks)
{
    unsigned int Index;
    string Pwd;
    for(Index = 01; Index < Ranks.size(); ++Index)
    {
        unsigned int v = Ranks[Index].i;
        Pwd = Ranks[Index].s;
        string::size_type x = Pwd.find(':');
        if (x != string::npos)
            Pwd.erase(0, x+1);

        *Out << Pwd.c_str() << "  ";
        for(x = Pwd.length(); x < 16; ++x)
            *Out << ' ';
        *Out << log(v*1.0) / log(2.0) << "  " << v << '\n';
    }
    return Index;
}
const int LINE_OUT_LEN = 160;
/**********************************************************************************
 * Output the data as C source.
 */
int OutputCode(ostream *Out, bool Cmnts, const string & CharSet, StringIntSet_t & StrSet, NodeSPtr & Root,
               StringOfInts & ChildAddrs, Uint64Vect & NodeData, UintVect & NodeEnds, StringIntVect_t & Ranks)
{
    unsigned int Index;
    int OutputSize;

    if (Cmnts)
        *Out << "#define ND(e,c,b) (c<<" << SHIFT_CHILD_MAP_INDEX << ")|b\n";

    // Output array of node data
    *Out << "#define ROOT_NODE_LOC 0\n"
            "#define BITS_CHILD_PATT_INDEX " << BITS_CHILD_PATT_INDEX << "\n"
            "#define BITS_CHILD_MAP_INDEX  " << BITS_CHILD_MAP_INDEX << "\n" 
            "#define SHIFT_CHILD_MAP_INDEX BITS_CHILD_PATT_INDEX\n"
            "#define SHIFT_WORD_ENDING_BIT (SHIFT_CHILD_MAP_INDEX + BITS_CHILD_MAP_INDEX)\n"
            "static const unsigned int DictNodes[" << NodeData.size() << "] =\n{";
    OutputSize = NodeData.size() * sizeof(unsigned int);
    int x = 999;
    unsigned int FewEndStart = 2000000000;
    for(Index = 0; Index < NodeData.size(); ++Index)
    {
        uint64_t v;
        x += 11;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n    ";
            x=0;
        }
        v = NodeData[Index];
        v &= (uint64_t(1) << SHIFT_WORD_ENDING_BIT) - 1;
        if (Cmnts)
        {
            uint64_t i;
            i = (v >> SHIFT_WORD_ENDING_BIT) & 3;
            *Out << "ND(" << i << ',';
            i= (v >> SHIFT_CHILD_MAP_INDEX) & ((1<<BITS_CHILD_MAP_INDEX)-1);
            *Out << i << ',';
            if (i < 10000) *Out << ' ';
            if (i < 1000)  *Out << ' ';
            if (i < 100)   *Out << ' ';
            if (i < 10)    *Out << ' ';
            i = v & ((1<<BITS_CHILD_PATT_INDEX)-1);
            *Out << i << ")";
            if (Index < (NodeData.size()-1))
            {
                *Out << ',';
                if (i < 1000)  *Out << ' ';
                if (i < 100)   *Out << ' ';
                if (i < 10)    *Out << ' ';
            }
        }
        else
        {
            *Out << v;
            if (Index < (NodeData.size()-1))
            {
                *Out << ',';
                if (v < 1000000000) *Out << ' ';
                if (v < 100000000) *Out << ' ';
                if (v < 10000000) *Out << ' ';
                if (v < 1000000) *Out << ' ';
                if (v < 100000) *Out << ' ';
                if (v < 10000) *Out << ' ';
                if (v < 1000)  *Out << ' ';
                if (v < 100)   *Out << ' ';
                if (v < 10)    *Out << ' ';
            }
        }
        if ((FewEndStart >= 2000000000) && !(NodeData[Index] & (uint64_t(1) << SHIFT_LARGE_ENDING_BIT)))
            FewEndStart = Index;
    }
    *Out << "\n};\n";
    unsigned int Len = ((NodeData.size() + 7) / 8);
    OutputSize += Len;
    x = 999;
    *Out << "static unsigned char WordEndBits[" << Len << "] =\n{";
    Index = 0;
    unsigned int v = 0;
    unsigned int y = 0;
    unsigned int z = 0;
    while(z < Len)
    {
        if (Index < NodeData.size())
        {
            if (NodeData[Index] & (uint64_t(1) << SHIFT_WORD_ENDING_BIT))
                v |= 1 << y;
        }
        if (++y >= 8)
        {
            x += 4;
            if (x > LINE_OUT_LEN)
            {
                *Out << "\n    ";
                x = 0;
            }
            *Out << v;
            if (++z < Len)
            {
                *Out << ',';
                if (v < 100) *Out << ' ';
                if (v < 10) *Out << ' ';
            }
            y = 0;
            v = 0;
        }
        ++Index;
    }
    *Out << "\n};\n";
    // Output array of node pointers
    *Out << "static const unsigned ";
    if (NodeData.size() > numeric_limits<unsigned short>::max())
    {
        *Out << "int";
        x = sizeof(unsigned int);
    }
    else
    {
        *Out << "short";
        x = sizeof(unsigned short);
    }
    *Out << " ChildLocs[" << ChildAddrs.size() << "] =\n{";
    OutputSize += x * ChildAddrs.size();
    x = 999;
    for(Index = 0; Index < ChildAddrs.size(); ++Index)
    {
        int v;
        x += 6;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n    ";
            x=0;
        }
        v = ChildAddrs[Index];
        *Out << v;
        if (Index < (ChildAddrs.size()-1))
        {
            *Out << ',';
            if (v < 10000) *Out << ' ';
            if (v < 1000)  *Out << ' ';
            if (v < 100)   *Out << ' ';
            if (v < 10)    *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Output the rank of the words
    *Out << "static const unsigned short Ranks[" << Ranks.size() << "] =\n{";
    OutputSize += Ranks.size() * sizeof(unsigned short);
    x = 999;
    bool TooBig = false;
    if (Cmnts)
    {
        *Out << "\n";
        for(Index = 0; Index < Ranks.size(); ++Index)
        {
            unsigned int v;
            *Out << "    ";
            v = Ranks[Index].i;
            if (v >= (1 << 15))
            {
                v -= 1 << 15;
                v /= 4;
                if (v >= (1 << 15))
                {
                    TooBig = true;
                    v = (1 << 15) - 1;
                }
                v |= 1 << 15;
            }
            if (v > numeric_limits<unsigned short>::max())
                v = numeric_limits<unsigned short>::max();
            *Out << v;
            if (Index < (Ranks.size()-1))
            {
                *Out << ',';
                if (v < 10000) *Out << ' ';
                if (v < 1000)  *Out << ' ';
                if (v < 100)   *Out << ' ';
                if (v < 10)    *Out << ' ';
            }
            *Out << " // " << Ranks[Index].s.c_str() << '\n';
        }
    }
    else
    {
        for(Index = 0; Index < Ranks.size(); ++Index)
        {
            unsigned int v;
            x += 6;
            if (x > LINE_OUT_LEN)
            {
                *Out << "\n    ";
                x=0;
            }
            v = Ranks[Index].i;
            if (v >= (1 << 15))
            {
                v -= 1 << 15;
                v /= 4;
                if (v >= (1<<15))
                {
                    TooBig = true;
                    v = (1 << 15) - 1;
                }
                v |= 1 << 15;
            }
            if (v > numeric_limits<unsigned short>::max())
                v = numeric_limits<unsigned short>::max();
            *Out << v;
            if (Index < (Ranks.size()-1))
            {
                *Out << ',';
                if (v < 10000) *Out << ' ';
                if (v < 1000)  *Out << ' ';
                if (v < 100)   *Out << ' ';
                if (v < 10)    *Out << ' ';
            }
        }
    }
    *Out << "\n};\n";
    if (TooBig)
    {
        unsigned int v  = ((1<<15) - 1) * 4 + (1<<15);
        cout << "// Word ranks too large, value restricted to " << v << endl;
    }
    unsigned int BytePerEntry = (CharSet.length() + 7) / 8;
    *Out << "#define SizeChildMapEntry " << BytePerEntry << '\n';
    *Out << "static const unsigned char ChildMap[" << StrSet.size() << '*' << BytePerEntry << "] =\n{";
    OutputSize += StrSet.size() * BytePerEntry * sizeof(unsigned char);

    StringIntSet_t::iterator Its;
    string Str;
    unsigned char Buf[8];

    // Get the items from StrSet ordered by the index
    StrIntPtrVect_t SetPtrs;
    SetPtrs.resize(StrSet.size());
    for(Its = StrSet.begin(); Its != StrSet.end(); ++Its)
    {
        StringInt *p = Its->Self();
        if (p->i >= StrSet.size())
        {
            cout << "p->i=" << p->i << "  " << p->s.c_str() << endl;
            throw "Bad index";
        }
        SetPtrs[p->i] = p;
    }
    unsigned int CharSetLen = 0;
    x = 999;
    Len = 0;
    for(Index = 0; Index < SetPtrs.size(); ++Index)
    {
        unsigned int i, j;
        string::size_type z, y;
        StringInt *p;
        memset(Buf, 0, sizeof Buf);
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n    ";
            x = 4*BytePerEntry;
        }
        p = SetPtrs[Index];
        Str = p->s;
        for(z = 0; z < Str.length(); ++z)
        {
            y = CharSet.find(Str[z]);
            if (y != string::npos)
            {
                Buf[y/8] |= 1 << (y & 7);
            }
        }
        // Find max bits set which indicates max number chars ued at a node
        for(i = j = 0; i < 8 * sizeof Buf; ++i)
        {
            if (Buf[i/8] & (1 << (i & 7)))
                ++j;
        }
        if (j > CharSetLen)
            CharSetLen = j;
        for(z = 0; z < BytePerEntry; ++z)
        {
            y = Buf[z] & 0xFF;
            *Out << y;
            if (z < (BytePerEntry-1))
                *Out << ',';
            else
            {
                if (Index < (SetPtrs.size() - 1))
                    *Out << ", ";
            }
            if (y < 100)
                *Out << ' ';
            if (y < 10)
                *Out << ' ';
            x += 4;
        }
        if (Cmnts)
        {
            *Out << " // " << p->i << ": " << Str;
            x = 999;
        }
    }
    *Out << "\n};\n#define CHARSET_SIZE " << (CharSetLen+1) << endl;

    // Output the top 8 bits of the node word endings count. Since node with >255 endings have
    // been placed at the begining, and ther are not too many of them the array is fairly small.
    *Out << "#define NumLargeCounts " << FewEndStart << "\n";
    *Out << "static const unsigned char EndCountLge[" << FewEndStart << "] =\n{";
    OutputSize += FewEndStart * sizeof(unsigned char);
    x = 999;
    for(Index = 0; Index < FewEndStart; ++Index)
    {
        unsigned int v;
        x += 4;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n    ";
            x=0;
        }
        v = NodeEnds[Index] >> 8;
        if (v >= 256)
            v = 0;
        *Out << v;
        if (Index < (FewEndStart-1))
        {
            *Out << ',';
            if (v < 100)   *Out << ' ';
            if (v < 10)    *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Output all the word ending counts. For the first few nodes this is just the lower 8 bits of
    // the value. For the rest each entry contains the whole count. The split between lower and
    // upper halves of the value for the first few nodes allows bytes arrays to be used, so saving
    // memory.
    *Out << "static const unsigned char EndCountSml[" << NodeEnds.size() << "] =\n{";
    OutputSize += NodeEnds.size() * sizeof(unsigned char);
    x = 999;
    for(Index = 0; Index < NodeEnds.size(); ++Index)
    {
        unsigned int v;
        x += 4;
        if (x > LINE_OUT_LEN)
        {
            *Out << "\n    ";
            x=0;
        }
        v = NodeEnds[Index] & 255;
        *Out << v;
        if (Index < (NodeEnds.size()-1))
        {
            *Out << ',';
            if (v < 100)   *Out << ' ';
            if (v < 10)    *Out << ' ';
        }
    }
    *Out << "\n};\n";

    // Finally output the used characters.
    *Out << "static const char CharSet[" << CharSet.length()+1 << "] = \"";
    OutputSize += CharSet.length() * sizeof(char);
    for(Index = 0; Index < CharSet.length(); ++Index)
    {
        char c = CharSet[Index];
        if ((c == '\\') || (c == '"'))
            *Out << '\\';
        *Out << c;
    }
    *Out << "\";" << endl;
    *Out << "#define ROOT_NODE_LOC " << Root->GetAddr() << "\n";
    return OutputSize + sizeof(unsigned int);
}
enum { OUT_C_CODE, OUT_BINARY, OUT_TESTER };
/**********************************************************************************
 */
int main(int argc, char *argv[])
{
    int i;
    int MaxRank = 999999999;
    int OutType = OUT_C_CODE;
    bool Verbose = false;
    bool Comments = false;
    string FileName, HashFile;
    char *OutFile = 0;
    EntryMap_t Entries;
    FileInfo InInfo[10];
    int NumFiles = 0;
    MinLength = 999;

    try
    {
        for(i = 1; i < argc; ++i)
        {
            FileName = argv[i];
            if (FileName == "-b")
            {
                // Output a binary file to stdout or file
                OutType = OUT_BINARY;
                continue;
            }
            if (FileName == "-t")
            {
                // Output a tester file to stdout or file
                OutType = OUT_TESTER;
                continue;
            }
            if (FileName == "-c")
            {
                // Add comments to the output (if text)
                Comments = true;
                continue;
            }
            if (FileName == "-o")
            {
                // Give output file
                if (++i < argc)
                    OutFile = argv[i];
                continue;
            }
            if (FileName == "-h")
            {
                // Give crc header output file
                if (++i < argc)
                    HashFile = argv[i];
                continue;
            }
            if (FileName == "-r")
            {
                // Ignore words with too high rank
                if (++i < argc)
                {
                    char *p=0;
                    MaxRank = strtol(argv[i], &p, 0);
                    if ((MaxRank < 1000) || *p)
                        MaxRank = 999999999;
                    continue;
                }
            }
            if (FileName == "-v")
            {
                Verbose = true;
                continue;
            }
            if (FileName[0] == '-')
            {
                cerr << "Usage: " << argv[0] << " [ -c ] [ -b | -t ] [ -o Ofile ] [ -h Hfile ] Files...\n"
                        "Where:\n"
                        "     -b        Generate a binary output file\n"
                        "     -t        Generate a test file for testing zxcvbn\n"
                        "     -c        Add comments to output file if C code mode\n"
                        "     -r number Ignore words with rank greater than number (must be >=1000)\n"
                        "     -v        Additional information output\n"
                        "     -h Hfile  Write file checksum to file Hfile as C code (for -b mode)\n"
                        "     -o Ofile  Write output to file Ofile\n"
                        "     Files     The dictionary input files to read\n"
                        "  If the -o option is not used, output is written to stdout\n"
                        "  if the -b option is not used, output is in the form of C source code\n"
                        << endl;
                return 1;
            }
            ReadInputFile(FileName, InInfo[NumFiles], MaxRank);
            if (NumFiles < int(sizeof InInfo / sizeof InInfo[0] - 1))
                ++NumFiles;
        }
        CombineWordLists(Entries, InInfo, NumFiles);
        if (Verbose)
        {
            if (!OutFile && (OutType == OUT_C_CODE))
                cout << "/*\n";
            for(i = 0; i < NumFiles; ++i)
            {
                FileInfo *Fi = InInfo + i;
                cout << "Read input file " << Fi->Name << endl;
                cout << "   Input words  " << Fi->Words << endl;
                cout << "   Used words   " << Fi->Used << endl;
                cout << "        Unused  " << Fi->BruteIgnore <<
                        " Bruteforce compare, " << Fi->Accented <<
                        " Accented char, " << Fi->Dups << " Duplicates" << endl;
            }
        }
        bool InputCharSet[256];
        NodeSPtr Root(new Node);
        // Initially charset of used chracters is empty
        memset(InputCharSet, 0, sizeof InputCharSet);

        // Add words to the trie with root in Root
        ProcessEntries(Root, Entries, InputCharSet);

        // Get some interesting info
        int NumEnds = Root->CalcEndings();
        int Hi = Root->CalcHeight();
        int NumNodes = Root->NodeCount();
        if (Verbose)
        {
            cout << "Max word length = " << MaxLength << endl;
            cout << "Min word length = " << MinLength << endl;
            cout << "Num input chars = " << NumChars << endl;
            cout << "Num input words = " << NumInWords << endl;
            cout << "Duplicate words = " << NumDuplicate;
            cout << "Number of Ends  = " << NumEnds << endl;
            cout << "Number of Nodes = " << NumNodes << endl;
            cout << "Trie height = " << Hi << endl;
        }
        // Store the alphabetical ordering of the input words
        i = 0;
        ScanTrieForOrder(Entries, i, Root, string());
        if (Verbose)
            cout << "Trie Order = " << i << endl;
        int InputOrder = i;
        // Reduce the Trie
        ReduceTrie(Root);

        // Output some interesting information
        NumNodes = Root->NodeCount();
        int ReduceEnds = Root->CalcEndings();
        if (Verbose)
        {
            cout << "After reduce:\n";
            cout << "Number of Ends  = " << ReduceEnds << endl;
            cout << "Number of Nodes = " << NumNodes << endl;
        }
        // Check reduction was OK
        StringIntVect_t Ranks;
        int CheckEnds = CheckReduction(Ranks, Root, Entries);
        if (Verbose)
            cout << "Number of Words = " << CheckEnds << endl;

        ChkNum Tst = CheckEntries(Root, string(), Entries);
        if (Verbose)
        {
            cout << "2nd check - Number of valid words = " << Tst.Match << endl;
            cout << "          Number of invalid words = " << Tst.Err << endl;
        }

        // Give up if there was an error
        if (Tst.Err)
            throw "Checks show invalid words after reduction";
        if ((Tst.Match != InputOrder) || (ReduceEnds != InputOrder))
            throw "Word count changed after reduce";

        // Output more info
        StringIntSet_t ChildBits;
        string CharSet = MakeCharSet(InputCharSet);
        if (Verbose)
            cout << "Used characters (" << CharSet.length() << "): " << CharSet.c_str() << endl;

        // Make a set of all unique child character patterns for the nodes
        i=0;
        Root->ClearCounted();
        MakeChildBitMap(ChildBits, Root, i);
        if (Verbose)
            cout << "Number of child bitmaps = " << ChildBits.size() << endl;

        // Get final node address
        Root->CalcAddress();

        Uint64Vect NodeData;
        UintVect NodeEnds;
        StringOfInts ChildAddrs;

        // Resize to save library adjusting allocation during data creation
        NodeData.resize(NumNodes, 4000000000);
        NodeEnds.resize(NumNodes, 4000000000);
        CreateArrays(Root, ChildBits, ChildAddrs, NodeData, NodeEnds);
        if (Verbose)
        {
            cout << "Node data array size " << NodeData.size() << endl;
            cout << "Child pointer array size " << ChildAddrs.size() << endl;
            cout << "Max node childs " << MaxNumChilds <<  " (chars " << MaxChildChars << " ) at character index "
                 << MaxChildsPosn << " using password " << PassWithMaxChilds.c_str() << endl;
        }
        shared_ptr<ofstream> fout;
        ostream *Out = &cout;
        if (OutFile)
        {
            fout = shared_ptr<ofstream>(new ofstream);
            if (OutType == OUT_BINARY)
                fout->open(OutFile, ios_base::trunc | ios_base::binary);
            else
                fout->open(OutFile, ios_base::trunc);
            Out = fout.get();
        }
        if (!OutFile && (OutType == OUT_C_CODE))
            cout << "*/\n";

        if (OutType == OUT_BINARY)
            i = OutputBinary(Out, HashFile, CharSet, ChildBits, ChildAddrs, NodeData, NodeEnds, Ranks);
        else if (OutType == OUT_TESTER)
            i = OutputTester(Out, Comments, Ranks);
        else
            i = OutputCode(Out, Comments, CharSet, ChildBits, Root, ChildAddrs, NodeData, NodeEnds, Ranks);

        if (fout)
        {
            fout->close();
        }
    }
    catch(const char *m)
    {
        cerr << m << endl;
        return 1;
    }
    catch(string m)
    {
        cerr << m.c_str() << endl;
        return 1;
    }
    catch(...)
    {
        cerr << "Unhandled exception" << endl;
        return 1;
    }
    return 0;
}

/**********************************************************************************/