package kylm.main;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import kylm.model.ClassMap;
import kylm.model.ngram.NgramLM;
import kylm.model.ngram.smoother.AbsoluteSmoother;
import kylm.model.ngram.smoother.GTSmoother;
import kylm.model.ngram.smoother.KNSmoother;
import kylm.model.ngram.smoother.MKNSmoother;
import kylm.model.ngram.smoother.MLSmoother;
import kylm.model.ngram.smoother.NgramSmoother;
import kylm.model.ngram.smoother.WBSmoother;
import kylm.model.ngram.writer.ArpaNgramWriter;
import kylm.model.ngram.writer.NgramWriter;
import kylm.model.ngram.writer.SerializedNgramWriter;
import kylm.model.ngram.writer.WFSTNgramWriter;
import kylm.reader.SentenceReader;
import kylm.reader.TextFileClassMapReader;
import kylm.reader.TextFileSentenceReader;
import kylm.reader.TextStreamSentenceReader;
import kylm.util.KylmConfigUtils;
import kylm.util.SymbolSet;

/* loaded from: input_file:kylm/main/CountNgrams.class */
/**
 * Command-line program that trains an n-gram language model from a training
 * corpus and writes it out in ARPA, binary, or WFST format.
 *
 * <p>Usage example (taken from the program's own help text):
 * {@code java -cp kylm.jar kylm.main.CountNgrams training.txt model.arpa}
 *
 * <p>Up to two positional arguments are accepted: the first is the training
 * file (standard input is read when it is absent) and the second is the output
 * file (standard output is written when it is absent).
 */
public class CountNgrams {
    /** Buffer size, in bytes, of the stream the finished model is written to. */
    private static final int OUTPUT_BUFFER_SIZE = 16384;

    /**
     * Parses the command line, configures and trains the language model, and
     * writes the result.
     *
     * @param strArr the raw command-line arguments
     * @throws Exception if reading the input, training, or writing the model fails
     */
    public static void main(String[] strArr) throws Exception {
        String newline = System.getProperty("line.separator");
        KylmConfigUtils config = new KylmConfigUtils("CountNgrams" + newline + "A program to calculate an n-gram language model given a training corpus" + newline + "Example: java -cp kylm.jar kylm.main.CountNgrams training.txt model.arpa");
        config.addGroup("N-gram model options");
        config.addEntry("n", 1, 3, false, "the length of the n-gram context");
        config.addEntry("trim", 5, null, false, "the trimming for each level of the n-gram (example: 0:1:1)");
        config.addEntry("name", 0, null, false, "the name of the model");
        config.addEntry("smoothuni", 3, false, false, "whether or not to smooth unigrams");
        config.addGroup("Symbol/Vocabulary options");
        config.addEntry("vocab", 0, null, false, "the vocabulary file to use");
        config.addEntry("startsym", 0, "<s>", false, "the symbol to use for sentence starts");
        config.addEntry("termsym", 0, "</s>", false, "the terminal symbol for sentences");
        config.addEntry("vocabout", 0, null, false, "the vocabulary file to write out to");
        config.addEntry("ukcutoff", 1, 0, false, "the cut-off for unknown words");
        config.addEntry("uksym", 0, "<unk>", false, "the symbol to use for unknown words");
        config.addEntry("ukexpand", 3, false, false, "expand unknown symbols in the vocabulary");
        config.addEntry("ukmodel", 4, null, false, "model unknown words. Arguments are processed first to last, so the most general model should be specified last. Format: \"symbol:vocabsize[:regex(.*)][:order(2)][:smoothing(wb)]\"");
        config.addGroup("Class options");
        config.addEntry("classes", 0, null, false, "a file containing word class definitions");
        config.addGroup("Smoothing options [default: kn]");
        config.addEntry("ml", 3, false, false, "maximum likelihood smoothing");
        config.addEntry("gt", 3, false, false, "Good-Turing smoothing (Katz Backoff)");
        config.addEntry("wb", 3, false, false, "Witten-Bell smoothing");
        config.addEntry("abs", 3, false, false, "absolute smoothing");
        config.addEntry("kn", 3, true, false, "Kneser-Ney smoothing (default)");
        config.addEntry("mkn", 3, false, false, "Modified Kneser-Ney smoothing (of Chen & Goodman)");
        config.addGroup("Output options [default: arpa]");
        config.addEntry("bin", 3, false, false, "output in binary format");
        config.addEntry("wfst", 3, false, false, "output in weighted finite state transducer format (WFST)");
        config.addEntry("arpa", 3, true, false, "output in ARPA format");
        config.addEntry("neginf", 2, null, false, "the number to print for non-existent backoffs (default: null, example: -99)");
        config.addGroup("Miscellaneous options");
        config.addEntry("debug", 1, 0, false, "the level of debugging information to print");

        // Whatever is left after option parsing: [input file [output file]].
        String[] positional = config.parseArguments(strArr);
        int debugLevel = config.getInt("debug");
        int order = config.getInt("n");
        if (positional.length > 2 || order == -1) {
            config.exitOnUsage();
        }

        // "kn" is registered as the default, so it is tested last; any other
        // smoothing flag that is set takes precedence over it.
        NgramSmoother smoother = null;
        if (config.getBoolean("ml")) {
            smoother = new MLSmoother();
        } else if (config.getBoolean("gt")) {
            smoother = new GTSmoother();
        } else if (config.getBoolean("wb")) {
            smoother = new WBSmoother();
        } else if (config.getBoolean("abs")) {
            smoother = new AbsoluteSmoother();
        } else if (config.getBoolean("mkn")) {
            smoother = new MKNSmoother();
        } else if (config.getBoolean("kn")) {
            smoother = new KNSmoother();
        }
        if (smoother == null) {
            System.err.println("A type of smoothing must be chosen (ml|gt|wb|abs|kn|mkn)");
            config.exitOnUsage(1);
        }
        smoother.setDebugLevel(debugLevel);
        smoother.setSmoothUnigrams(config.getBoolean("smoothuni"));

        // "arpa" is registered as the default, so it is tested after the
        // explicitly requested binary and WFST formats.
        NgramWriter writer = null;
        if (config.getBoolean("bin")) {
            writer = new SerializedNgramWriter();
        } else if (config.getBoolean("wfst")) {
            writer = new WFSTNgramWriter();
        } else if (config.getBoolean("arpa")) {
            ArpaNgramWriter arpaWriter = new ArpaNgramWriter();
            Object negativeInfinity = config.getValue("neginf");
            if (negativeInfinity != null) {
                arpaWriter.setNegativeInfinity((Float) negativeInfinity);
            }
            writer = arpaWriter;
        } else {
            System.err.println("A type of writer must be chosen (arpa|bin|wfst)");
            config.exitOnUsage(1);
        }

        // Read the corpus from the first positional argument, or from standard input.
        SentenceReader corpus = positional.length > 0 ? new TextFileSentenceReader(positional[0]) : new TextStreamSentenceReader(System.in);

        NgramLM model = new NgramLM(order, smoother);
        model.getSmoother().setCutoffs(config.getIntArray("trim"));
        model.setDebug(debugLevel);
        model.setName(config.getString("name"));
        model.setUnknownSymbol(config.getString("uksym"));
        model.setVocabFrequency(config.getInt("ukcutoff"));
        model.setStartSymbol(config.getString("startsym"));
        model.setTerminalSymbol(config.getString("termsym"));

        // Build one sub-model per -ukmodel specification, preserving the given order.
        String[] unknownSpecs = config.getStringArray("ukmodel");
        if (unknownSpecs != null) {
            NgramLM[] unknownModels = new NgramLM[unknownSpecs.length];
            for (int idx = 0; idx < unknownSpecs.length; idx++) {
                unknownModels[idx] = getUnknownModel(unknownSpecs[idx]);
            }
            model.setUnknownModels(unknownModels);
        }

        String vocabFile = config.getString("vocab");
        if (vocabFile != null) {
            model.setVocab(SymbolSet.readFromFile(vocabFile));
            if (debugLevel > 0) {
                System.err.println("CountNgrams, loaded " + model.getVocab().getSize() + " vocabulary");
            }
        } else if (!corpus.supportsReset()) {
            // Without a fixed vocabulary the reader must support reset, which
            // piped input does not.
            System.err.println("CountNgrams only supports piped input if the vocabulary is specified.");
            System.err.println("Either specify a vocabulary or load the input directly from a file.");
            System.exit(1);
        }

        String classFile = config.getString("classes");
        if (classFile != null) {
            ClassMap classMap = new TextFileClassMapReader(classFile).readClassMap(model.getVocab(), model.getUnknownModelCount() + 2, false);
            // Register the terminal symbol as an alias for the start symbol's id.
            classMap.getClasses().addAlias(model.getTerminalSymbol(), model.getId(model.getStartSymbol()));
            model.setClassMap(classMap);
        }

        model.trainModel(corpus);

        String vocabOutFile = config.getString("vocabout");
        if (vocabOutFile != null) {
            model.getVocab().writeToFile(vocabOutFile, false);
        }
        if (config.getBoolean("ukexpand")) {
            model.expandUnknowns();
        }

        if (debugLevel > 0) {
            System.err.println("CountNgrams, Started writing");
        }
        long writeStart = System.currentTimeMillis();
        BufferedOutputStream out = new BufferedOutputStream(positional.length > 1 ? new FileOutputStream(positional[1]) : System.out, OUTPUT_BUFFER_SIZE);
        try {
            writer.write(model, out);
        } finally {
            // Close (and thereby flush) the stream even when writing fails, so
            // the file handle is not leaked.
            out.close();
        }
        if (debugLevel > 0) {
            System.err.println("CountNgrams, done writing - " + (System.currentTimeMillis() - writeStart) + " ms");
        }
    }

    /**
     * Builds a sub-model for unknown words from a colon-separated specification
     * of the form {@code symbol:vocabsize[:regex][:order][:smoothing]}.
     *
     * <p>The order defaults to 2 and the smoothing to {@code wb} when omitted;
     * the regex is only set on the model when it is present. Any malformed
     * specification (too few fields, a non-integer vocabulary size or order, or
     * an unrecognised smoother name) prints a message to standard error and
     * terminates the JVM with exit status 1.
     *
     * @param str the unknown-model specification
     * @return the configured, untrained unknown-word model
     */
    private static NgramLM getUnknownModel(String str) {
        String[] fields = str.split(":");
        if (fields.length < 2) {
            System.err.println("Must specify at least a symbol and a vocabulary size for unknown models (e.g. <unk>:5000)");
            System.exit(1);
        }
        String symbol = fields[0];
        int vocabSize = parseIntOrExit(fields[1], "vocabulary size", symbol);
        String regex = fields.length > 2 ? fields[2] : null;
        int order = fields.length > 3 ? parseIntOrExit(fields[3], "ngram-order size", symbol) : 2;
        String smootherName = fields.length > 4 ? fields[4] : "wb";

        NgramSmoother smoother = null;
        if (smootherName.equals("ml")) {
            smoother = new MLSmoother();
        } else if (smootherName.equals("gt")) {
            smoother = new GTSmoother();
        } else if (smootherName.equals("wb")) {
            smoother = new WBSmoother();
        } else if (smootherName.equals("abs")) {
            smoother = new AbsoluteSmoother();
        } else if (smootherName.equals("mkn")) {
            smoother = new MKNSmoother();
        } else if (smootherName.equals("kn")) {
            smoother = new KNSmoother();
        } else {
            System.err.println("Illegal smoother type in unknown model \"" + str + "\"");
            System.exit(1);
        }

        NgramLM unknownModel = new NgramLM(order, smoother);
        unknownModel.setSymbol(symbol);
        unknownModel.setVocabLimit(vocabSize);
        if (regex != null) {
            unknownModel.setRegex(regex);
        }
        return unknownModel;
    }

    /**
     * Parses an integer field of an unknown-model specification, terminating
     * the JVM with exit status 1 when the text is not a valid integer.
     *
     * @param text        the text to parse
     * @param description what the value represents, used in the error message
     * @param symbol      the unknown-model symbol the value belongs to, used in
     *                    the error message
     * @return the parsed integer
     */
    private static int parseIntOrExit(String text, String description, String symbol) {
        try {
            return Integer.parseInt(text);
        } catch (NumberFormatException e) {
            System.err.println("Illegal " + description + " for " + symbol + ": " + text + ". Must be an integer.");
            // Abort instead of continuing with a fallback value the user never asked for.
            System.exit(1);
            return 0; // not reached; required by the compiler
        }
    }
}
