kylm.model
Class LanguageModel

java.lang.Object
  extended by kylm.model.LanguageModel
All Implemented Interfaces:
java.io.Serializable
Direct Known Subclasses:
NgramLM

public abstract class LanguageModel
extends java.lang.Object
implements java.io.Serializable

An abstract class representing many of the common functions of language models. All probabilities and entropies are represented as floats in log10 form.

Author:
neubig. TODO: allow limitation of vocabulary by size, not only by word frequency. TODO: allow conversion to and from regular/log probabilities.
See Also:
Serialized Form

Constructor Summary
LanguageModel()
           
 
Method Summary
 boolean equals(java.lang.Object obj)
           
 int findUnknownId(java.lang.String key)
          Find the appropriate unknown word model for the key
 float[] getClassEntropies()
           
 ClassMap getClassMap()
           
 boolean getCountTerminals()
           
 int getDebug()
           
 int getId(java.lang.String word)
          Get the ID assigned to a particular word
 int getMaxLength()
           
 java.lang.String getName()
           
 java.util.regex.Pattern getRegex()
           
 float getSentenceClassEntropy()
           
 float getSentenceEntropy(java.lang.String[] sent)
          Returns the entropy of an entire sentence.
 int[] getSentenceIds(java.lang.String[] sent)
          Get an array of IDs for a sentence, adding terminal symbols on the left side and, if countTerminals is true, on the right side as well.
 float getSentenceSimpleEntropy()
           
 float getSentenceUnknownEntropy()
           
 float[] getSimpleEntropies()
           
 java.lang.String getStartSymbol()
           
 java.lang.String getSymbol()
           
 java.lang.String getTerminalSymbol()
           
 float[] getUnknownEntropies()
           
 int getUnknownModelCount()
           
 LanguageModel[] getUnknownModels()
           
 java.lang.String getUnknownSymbol()
           
 SymbolSet getVocab()
           
 int getVocabFrequency()
          Get the frequency limit for the vocabulary.
 int getVocabLimit()
           
 java.lang.String[] getVocabulary()
          Get all in-vocabulary words.
 float[] getWordEntropies(int[] ids)
          Get the entropies of every word in a sentence by ID.
 float[] getWordEntropies(java.lang.String[] sent)
          Get the entropies of every word in a sentence.
abstract  float getWordEntropy(int[] ids, int pos)
          Get the entropy of the word at position pos in the sequence, by ID.
 void importVocabulary(java.lang.Iterable<java.lang.String[]> sl)
          Load the vocabulary and trim it at the appropriate level.
 boolean isClosed()
           
 boolean isInVocab(int idx)
          Returns whether or not an id is in the vocabulary.
 boolean isInVocab(java.lang.String str)
          Returns whether or not a string is in the vocabulary; also returns false for special symbols.
abstract  java.lang.String printReport()
           
 void setClassMap(ClassMap cm)
           
 void setClosed(boolean closed)
           
 void setCountTerminals(boolean countTerminals)
           
 void setDebug(int debug)
           
 void setMaxLength(int maxLength)
           
 void setName(java.lang.String name)
           
 void setRegex(java.lang.String regex)
           
 void setStartSymbol(java.lang.String startSymbol)
           
 void setSymbol(java.lang.String symbol)
           
 void setTerminalSymbol(java.lang.String terminalSymbol)
           
 void setUnknownModels(LanguageModel[] ukModels)
           
 void setUnknownSymbol(java.lang.String ukSymbol)
           
 void setVocab(SymbolSet newVocab)
           
 void setVocabFrequency(int vocabFrequency)
          Set the limit on the vocabulary frequency.
 void setVocabLimit(int vocabLimit)
           
 void setVocabulary(java.lang.String[] voc)
          Set the vocabulary to be used by the language model.
abstract  void trainModel(java.lang.Iterable<java.lang.String[]> sl)
           
 
Methods inherited from class java.lang.Object
getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

LanguageModel

public LanguageModel()
Method Detail

equals

public boolean equals(java.lang.Object obj)
Overrides:
equals in class java.lang.Object

getWordEntropy

public abstract float getWordEntropy(int[] ids,
                                     int pos)
Get the entropy of the word at position pos in the sequence, by ID.

Parameters:
ids - The IDs of the words in the sentence. Will always start and end with the sentence terminal symbol.
pos - The position of the word to be judged in ids
Returns:
The entropy of the word at position pos given the rest as context

getWordEntropies

public float[] getWordEntropies(int[] ids)
Get the entropies of every word in a sentence by ID. The version implemented in LanguageModel calls getWordEntropy individually for each value, but might be overridden for higher efficiency.

Parameters:
ids - The IDs of the words in the sentence. Will always start and end with the sentence terminal symbol.
Returns:
An array of entropies of length ids.length-1. The first non-terminal symbol need not be assigned an entropy.

getWordEntropies

public float[] getWordEntropies(java.lang.String[] sent)
Get the entropies of every word in a sentence. The sent array should not contain terminal symbols.

Parameters:
sent - The string of words
Returns:
An array of float entropies.

getClassEntropies

public float[] getClassEntropies()

getSimpleEntropies

public float[] getSimpleEntropies()

getUnknownEntropies

public float[] getUnknownEntropies()

getSentenceEntropy

public float getSentenceEntropy(java.lang.String[] sent)
Returns the entropy of an entire sentence.

Parameters:
sent - The sentence to find the entropy of
Returns:
The entropy of the sentence

getSentenceSimpleEntropy

public float getSentenceSimpleEntropy()

getSentenceUnknownEntropy

public float getSentenceUnknownEntropy()

getSentenceClassEntropy

public float getSentenceClassEntropy()

trainModel

public abstract void trainModel(java.lang.Iterable<java.lang.String[]> sl)
                         throws java.io.IOException
Throws:
java.io.IOException

printReport

public abstract java.lang.String printReport()

setVocabulary

public void setVocabulary(java.lang.String[] voc)
Set the vocabulary to be used by the language model. Everything else will be treated as an out-of-vocabulary word.


isInVocab

public boolean isInVocab(java.lang.String str)
Returns whether or not a string is in the vocabulary; also returns false for special symbols.

Parameters:
str - The symbol to check
Returns:
whether or not the symbol is in the vocab

isInVocab

public boolean isInVocab(int idx)
Returns whether or not an id is in the vocabulary.

Parameters:
idx - the index to check
Returns:
whether or not the symbol is in the vocab

getSentenceIds

public int[] getSentenceIds(java.lang.String[] sent)
Get an array of IDs for a sentence, adding terminal symbols on the left side and, if countTerminals is true, on the right side as well.

Parameters:
sent - The sentence to convert
Returns:
The ID array

getVocabulary

public java.lang.String[] getVocabulary()
Get all in-vocabulary words.

Returns:
An array containing every vocabulary word

importVocabulary

public void importVocabulary(java.lang.Iterable<java.lang.String[]> sl)
                      throws java.io.IOException
Load the vocabulary and trim it at the appropriate level.

Parameters:
sl - The sentenceLoader containing the corpus sentences.
Throws:
java.io.IOException

findUnknownId

public int findUnknownId(java.lang.String key)
Find the appropriate unknown word model for the key


getId

public int getId(java.lang.String word)
Get the ID assigned to a particular word

Parameters:
word - The word to find the id for
Returns:
The ID of the word itself for in-vocabulary (IV) words, or the ID of the unknown model for out-of-vocabulary (OOV) words

getDebug

public int getDebug()

setDebug

public void setDebug(int debug)

getSymbol

public java.lang.String getSymbol()

setSymbol

public void setSymbol(java.lang.String symbol)

getName

public java.lang.String getName()

setName

public void setName(java.lang.String name)

getVocab

public SymbolSet getVocab()

setVocab

public void setVocab(SymbolSet newVocab)

getStartSymbol

public java.lang.String getStartSymbol()

setStartSymbol

public void setStartSymbol(java.lang.String startSymbol)

getTerminalSymbol

public java.lang.String getTerminalSymbol()

setTerminalSymbol

public void setTerminalSymbol(java.lang.String terminalSymbol)

getUnknownSymbol

public java.lang.String getUnknownSymbol()

setUnknownSymbol

public void setUnknownSymbol(java.lang.String ukSymbol)

getUnknownModels

public LanguageModel[] getUnknownModels()

setUnknownModels

public void setUnknownModels(LanguageModel[] ukModels)

isClosed

public boolean isClosed()

setClosed

public void setClosed(boolean closed)

getCountTerminals

public boolean getCountTerminals()

setCountTerminals

public void setCountTerminals(boolean countTerminals)

getVocabFrequency

public int getVocabFrequency()
Get the frequency limit for the vocabulary.

Returns:
The frequency limit for the vocabulary. All words that occur this many times or fewer will be treated as unknowns.

setVocabFrequency

public void setVocabFrequency(int vocabFrequency)
Set the limit on the vocabulary frequency.

Parameters:
vocabFrequency - All words that occur this many times or fewer will be treated as unknown words.

getRegex

public java.util.regex.Pattern getRegex()

setRegex

public void setRegex(java.lang.String regex)

getVocabLimit

public int getVocabLimit()

setVocabLimit

public void setVocabLimit(int vocabLimit)

getMaxLength

public int getMaxLength()

setMaxLength

public void setMaxLength(int maxLength)

getClassMap

public ClassMap getClassMap()

setClassMap

public void setClassMap(ClassMap cm)

getUnknownModelCount

public int getUnknownModelCount()