morfessor.py 文件源码-python代码片段

def main():
    logging.basicConfig(stream=stderr, level=INFO)

    a = ArgumentParser()
    a.add_argument('-data', dest='data', required=True, metavar='WORDLIST',
                   help="a text file (the corpus) consisting of one word per line. The word may be preceded by a word"\
                        " count (separated by whitespace), otherwise a count of one is assumed. If the same word "\
                        "occurs many times, the counts are accumulated.")
    a.add_argument('-finish', dest='finish', metavar='float', type=float, default=0.005,
                   help="convergence threshold. From one pass over all input words to the next, "\
                        "if the overall coding length in bits (i.e. logprob) of the lexicon together with the corpus "\
                        "improves less than this value times the number of word types (distinct word forms) in the "\
                        "data, the program stops. (If this value is small the program runs for a longer time and the "\
                        "result is in principle more accurate. However, the changes in word splittings during the "\
                        "last training epochs are usually very small.) The value must be within the range: 0 < float "\
                        "< 1. Default 0.005")
    a.add_argument('-rand', dest='rand', metavar='int', type=int, default=0,
                   help="random seed that affects the sorting of words when processing them. Default 0")
    a.add_argument('-gammalendistr', dest='gammalendistr', type=float, metavar='float', nargs=2,
                   help="Use Gamma Length distribution with two parameters. Float1 is the prior for the most common "\
                        "morph length in the lexicon, such that 0 < float1 <= 24*float2. Float2 is the beta value of "\
                        "the Gamma pdf, such that beta > 0. The beta value affects the wideness of the morph length "\
                        "distribution. The higher beta, the wider and less discriminative the distribution. If this "\
                        "option is omitted, morphs in the lexicon are terminated with  an end-of-morph character, "\
                        "which corresponds to an exponential pdf for morph lengths. Suggested values: float1 = 7.0, "\
                        "float2 = 1.0 ")
    a.add_argument('-zipffreqdistr', dest='zipffreqdistr', type=float, metavar='float',
                   help="Use Zipf Frequency distribution with paramter float1 for the proportion of morphs in the "\
                        "lexicon that occur only once in the data (hapax legomena): 0 < value < 1. If this option is "\
                        "omitted a (non-informative) morph frequency distribution based on enumerative coding is used"\
                        " instead. Suggested value: 0.5")
    a.add_argument('-load', dest='load', metavar='filename',
                   help="An existing model for word splitting is loaded from a file (which is the output of an "\
                        "earlier run of this program) and the words in the corpus defined using the option '-data "\
                        "wordlist' are segmented according to the loaded model. That is, "\
                        "no learning of a new model takes place. The existing model is simply used for segmenting a " \
                        "list of words. The segmentation takes place using Viterbi search. No new morphs are ever " \
                        "created (except one-letter morphs, if there is no other way of segmenting a particular input" \
                        " word)")

    a.add_argument('-encoding', dest='encoding', help='Input encoding (defaults to local encoding)')

    a.add_argument('-savememory', type=int, nargs='?', help=SUPPRESS)

    options = a.parse_args()

    if options.load is not None:
        m = MorphModel(vars(options))
        m.load(options.load)

        for word in open(options.data):
            print(' + '.join(m.viterbi_segment_word(word.strip())))

    else:
        m = MorphModel(vars(options))
        m.train(options.data)
        stderr.flush()
        m.print_segmentation()