FileDocCategorySizeDatePackage
MakeBinaryDictionary.javaAPI DocAndroid 1.5 API10809Wed May 06 22:41:10 BST 2009com.android.tools.dict

MakeBinaryDictionary.java

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.tools.dict;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

/**
 * Compresses a list of words and frequencies into a tree structured binary dictionary.
 */
public class MakeBinaryDictionary {

    public static final int ALPHA_SIZE = 256;

    public static final String TAG_WORD = "w";
    public static final String ATTR_FREQ = "f";

    private static final int FLAG_ADDRESS_MASK  = 0x400000;
    private static final int FLAG_TERMINAL_MASK = 0x800000;
    private static final int ADDRESS_MASK = 0x3FFFFF;
    
    public static final CharNode EMPTY_NODE = new CharNode();

    List<CharNode> roots;
    Map<String, Integer> mDictionary;
    int mWordCount;
    
    static class CharNode {
        char data;
        int freq;
        boolean terminal;
        List<CharNode> children;
        static int sNodes;

        public CharNode() {
            sNodes++;
        }
    }

    public static void usage() {
        System.err.println("Usage: makedict <src.xml> <dest.dict>");
        System.exit(-1);
    }
    
    public static void main(String[] args) {
        if (args.length < 2) {
            usage();
        } else {
            new MakeBinaryDictionary(args[0], args[1]);
        }
    }

    public MakeBinaryDictionary(String srcFilename, String destFilename) {
        populateDictionary(srcFilename);
        writeToDict(destFilename);
        // Enable the code below to verify that the generated tree is traversable.
        if (false) {
            traverseDict(0, new char[32], 0);
        }
    }
    
    private void populateDictionary(String filename) {
        roots = new ArrayList<CharNode>();
        try {
            SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
            parser.parse(new File(filename), new DefaultHandler() {
                boolean inWord;
                int freq;

                @Override
                public void startElement(String uri, String localName,
                        String qName, Attributes attributes) {
                    if (qName.equals("w")) {
                        inWord = true;
                        freq = Integer.parseInt(attributes.getValue(0));
                    }
                }

                @Override
                public void characters(char[] data, int offset, int length) {
                    // Ignore other whitespace
                    if (!inWord) return;
                    
                    // Ignore one letter words
                    if (length < 2) return;
                    mWordCount++;
                    String word = new String(data, offset, length);
                    addWordTop(word, freq);
                }

                @Override
                public void endElement(String uri, String localName,
                        String qName) {
                    if (qName.equals("w")) inWord = false;
                }
            });
        } catch (Exception ioe) {
            System.err.println("Exception in parsing\n" + ioe);
            ioe.printStackTrace();
        }
        System.out.println("Nodes = " + CharNode.sNodes);
    }

    private int indexOf(List<CharNode> children, char c) {
        if (children == null) {
            return -1;
        }
        for (int i = 0; i < children.size(); i++) {
            if (children.get(i).data == c) {
                return i;
            }
        }
        return -1;
    }

    private void addWordTop(String word, int occur) {
        if (occur > 255) occur = 255;

        char firstChar = word.charAt(0);
        int index = indexOf(roots, firstChar);
        if (index == -1) {
            CharNode newNode = new CharNode();
            newNode.data = firstChar;
            newNode.freq = occur;
            index = roots.size();
            roots.add(newNode);
        } else {
            roots.get(index).freq += occur;
        }
        if (word.length() > 1) {
            addWordRec(roots.get(index), word, 1, occur);
        } else {
            roots.get(index).terminal = true;
        }
    }

    private void addWordRec(CharNode parent, String word, int charAt, int occur) {
        CharNode child = null;
        char data = word.charAt(charAt);
        if (parent.children == null) {
            parent.children = new ArrayList<CharNode>();
        } else {
            for (int i = 0; i < parent.children.size(); i++) {
                CharNode node = parent.children.get(i);
                if (node.data == data) {
                    child = node;
                    break;
                }
            }
        }
        if (child == null) {
            child = new CharNode();
            parent.children.add(child);
        }
        child.data = data;
        if (child.freq == 0) child.freq = occur;
        if (word.length() > charAt + 1) {
            addWordRec(child, word, charAt + 1, occur);
        } else {
            child.terminal = true;
            child.freq = occur;
        }
    }

    byte[] dict;
    int dictSize;
    static final int CHAR_WIDTH = 8;
    static final int FLAGS_WIDTH = 1; // Terminal flag (word end)
    static final int ADDR_WIDTH = 23; // Offset to children
    static final int FREQ_WIDTH_BYTES = 1;
    static final int COUNT_WIDTH_BYTES = 1;

    private void addCount(int count) {
        dict[dictSize++] = (byte) (0xFF & count);
    }
    
    private void addNode(CharNode node) {
        int charData = 0xFFFF & node.data;
        if (charData > 254) {
            dict[dictSize++] = (byte) 255;
            dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
            dict[dictSize++] = (byte) (node.data & 0xFF);
        } else {
            dict[dictSize++] = (byte) (0xFF & node.data);
        }
        if (node.children != null) {
            dictSize += 3; // Space for children address
        } else {
            dictSize += 1; // Space for just the terminal/address flags
        }
        if ((0xFFFFFF & node.freq) > 255) {
            node.freq = 255;
        }
        if (node.terminal) {
            byte freq = (byte) (0xFF & node.freq);
            dict[dictSize++] = freq;
        }
    }

    int nullChildrenCount = 0;
    int notTerminalCount = 0;

    private void updateNodeAddress(int nodeAddress, CharNode node,
            int childrenAddress) {
        if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
            nodeAddress += 2;
        }
        childrenAddress = ADDRESS_MASK & childrenAddress;
        if (childrenAddress == 0) {
            nullChildrenCount++;
        } else {
            childrenAddress |= FLAG_ADDRESS_MASK;
        }
        if (node.terminal) {
            childrenAddress |= FLAG_TERMINAL_MASK;
        } else {
            notTerminalCount++;
        }
        dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
        if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
            dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
            dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
        }
    }

    void writeWordsRec(List<CharNode> children) {
        if (children == null || children.size() == 0) {
            return;
        }
        final int childCount = children.size();
        addCount(childCount);
        //int childrenStart = dictSize;
        int[] childrenAddresses = new int[childCount];
        for (int j = 0; j < childCount; j++) {
            CharNode node = children.get(j);
            childrenAddresses[j] = dictSize;
            addNode(node);
        }
        for (int j = 0; j < childCount; j++) {
            CharNode node = children.get(j);
            int nodeAddress = childrenAddresses[j];
            int cacheDictSize = dictSize;
            writeWordsRec(node.children);
            updateNodeAddress(nodeAddress, node, node.children != null
                    ? cacheDictSize : 0);
        }
    }

    void writeToDict(String dictFilename) {
        // 4MB max, 22-bit offsets
        dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
                                          // < 1MB in most cases, as there is a limit in the
                                          // resource size in apks.
        dictSize = 0;
        writeWordsRec(roots);
        System.out.println("Dict Size = " + dictSize);
        try {
            FileOutputStream fos = new FileOutputStream(dictFilename);
            fos.write(dict, 0, dictSize);
            fos.close();
        } catch (IOException ioe) {
            System.err.println("Error writing dict file:" + ioe);
        }
    }

    void traverseDict(int pos, char[] word, int depth) {
        int count = dict[pos++] & 0xFF;
        for (int i = 0; i < count; i++) {
            char c = (char) (dict[pos++] & 0xFF);
            if (c == 0xFF) {
                c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
                pos += 2;
            }
            word[depth] = c;
            boolean terminal = (dict[pos] & 0x80) > 0;
            int address = 0;
            if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
                address = 
                    ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
                    | ((dict[pos + 1] & 0xFF) << 8)
                    | ((dict[pos + 2] & 0xFF));
                pos += 2;
            }
            pos++;
            if (terminal) {
                showWord(word, depth + 1, dict[pos] & 0xFF);
                pos++;
            }
            if (address != 0) {
                traverseDict(address, word, depth + 1);
            }
        }
    }

    void showWord(char[] word, int size, int freq) {
        System.out.print(new String(word, 0, size) + " " + freq + "\n");
    }
}