################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

### You definitely need to edit the lines starting with the following:
# username, home-dir, working-dir, input-extension, output-extension,
# pair-extension, moses-src-dir, external-bin-dir

[GENERAL]

input-extension = en
output-extension = de
pair-extension = de-en

### Your username
username = myusername

### On Cygwin or Linux, it is /home/$username
### On Mac OSX, it is /Users/$username
home-dir = /home/$username

### directory in which the experiment is run
working-dir = $home-dir/corpora
data-dir = $working-dir
train-dir = $data-dir
dev-dir = $data-dir/dev

# Moses paths
moses-src-dir = $home-dir/moses
moses-script-dir = $moses-src-dir/scripts
moses-bin-dir = $moses-src-dir/bin
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
decoder = $moses-bin-dir/moses

# directory where the GIZA++/MGIZA programs reside
external-bin-dir = $moses-src-dir/mgiza/mgizapp/bin

input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $output-extension"
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

### multi-core settings
# when the generic parallelizer is used, it runs on
# the number of cores specified here
cores = 3

srilm-dir = $moses-src-dir/
irstlm-dir = $moses-src-dir/
randlm-dir = $moses-src-dir/

#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

max-sentence-length = 80

### command to run to get raw corpus files
#
# get-corpus-script =

[CORPUS:mycorpus]

raw-stem = $train-dir/news-commentary-v10.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
settings = "--prune '0 0 1' -T $working-dir/lm -S 40%"
order = 4
type = 8
lm-binarizer = $moses-bin-dir/build_binary

### each language model to be used has its own section here

[LM:mycorpus]

raw-corpus = $train-dir/news-commentary-v10.$pair-extension.$output-extension
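### For reference, the kenlm training step configured above corresponds
# roughly to running lmplz and build_binary by hand. This is only a
# hedged sketch: EMS drives these tools through the wrapper above, and
# the corpus/output file names below are placeholders, not files that
# EMS creates under these names.
#
#   $moses-bin-dir/lmplz -o 4 -S 40% -T $working-dir/lm --prune 0 0 1 \
#       < corpus.tok.$output-extension > lm.$output-extension.arpa
#   $moses-bin-dir/build_binary lm.$output-extension.arpa lm.$output-extension.binlm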
#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM]

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance, [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
#script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
#tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
#
# irstlm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

# If everything else will be parallel, you'll want to set mgiza-cpus to
# half of the number of your CPUs, since we're aligning in both directions
training-options = "-parallel -cores 3 -sort-buffer-size 1G -sort-compress gzip -sort-parallel 3 -mgiza -mgiza-cpus 2"

### factored training: specify here which factors are used
# if none specified, single-factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
#run-giza-in-parts = 5

### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
#alignment-symmetrization-method = berkeley
alignment-symmetrization-method = grow-diag-final-and

### use of Chris Dyer's fast_align for word alignment
#
#fast-align-settings = "-d -o -v"

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 4"
#berkeley-process-options = "-EMWordAligner.numThreads 4"
#berkeley-posterior = 0.5

### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
#  $working-dir/training/prepared.$baseline/$output-extension.vcb \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
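### For illustration only: a word alignment file like the one pointed to
# above contains one line per sentence pair, made up of whitespace-separated
# source-target token index pairs (0-based), e.g.
#
#   0-0 1-1 2-3 3-2 4-4
#
# (the indices here are invented; the actual files are produced by the
# alignment and symmetrization steps)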
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and the ratio to be kept,
# either before or after word alignment
#
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### build memory-mapped suffix array phrase table
# (binarizing the reordering table is a good idea, since filtering makes little sense)
#
#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
#binarize-all = $moses-script-dir/training/binarize-model.perl

### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor

## Operation Sequence Model (OSM)
# Durrani, Schmid and Fraser (2011):
# "A Joint Sequence Translation Model with Integrated Reordering"
# compile Moses with --max-kenlm-order=9 if a higher order is required
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# if OSM training should be skipped, point to the OSM model
#osm-model =

### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL 2014):
# "Integrating an Unsupervised Transliteration Model
#  into Statistical Machine Translation"
#
#transliteration-module = "yes"
#post-decoding-transliteration = "yes"

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### if word alignment (giza symmetrization) should be skipped,
# point to word alignment files
#
#word-alignment =

### settings for rule extraction
#
#extract-settings = ""
max-phrase-length = 5

### if phrase extraction should be skipped,
# point to the stem for extract files
#
#extracted-phrases =

### settings for rule scoring
#
score-settings = "--GoodTuring"

### if phrase table training should be skipped,
# point to the phrase translation table
#
#phrase-translation-table =

### if reordering table training should be skipped,
# point to the reordering table
#
#reordering-table =

### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =

#######################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir --threads=4"

### specify the corpus used for tuning
# it should contain hundreds, if not thousands, of sentences
#
#raw-input =
#raw-reference =
input-sgm = $dev-dir/news-test2008-src.$input-extension.sgm
reference-sgm = $dev-dir/news-test2008-ref.$output-extension.sgm

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =

### additional flags for the decoder
#
decoder-settings = "-threads 3"

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
raw-stem = CORPUS:raw-stem

### trained model
#
#truecase-model =
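### For reference, the truecaser steps configured here correspond roughly
# to the following manual commands. This is a sketch only; the model and
# corpus file names are placeholders, not the names EMS uses internally.
#
#   $moses-script-dir/recaser/train-truecaser.perl \
#       --model truecase-model.$output-extension --corpus corpus.tok.$output-extension
#   $moses-script-dir/recaser/truecase.perl \
#       --model truecase-model.$output-extension < input.tok > input.tc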
##################################
## EVALUATION: score system output

[EVALUATION]

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"

### additional decoder settings
# switches for the Moses decoder
# common choices:
#   "-threads N" for multi-threading
#   "-mbr" for MBR decoding
#   "-drop-unknown" for dropping unknown source words
#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
# decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
decoder-settings = "-threads 4"

### should output be scored case-sensitive (default: no)?
#
# case-sensitive = yes

### BLEU
#
multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
# ibm-bleu =

### TER: translation error rate (BBN metric) based on edit distance
#
# ter = $edinburgh-script-dir/tercom_v6a.pl

### METEOR: gives credit to stem / wordnet synonym matches
#
# meteor =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
#
# visualization of the search graph in tree-based models
#analyze-search-graph = yes

[EVALUATION:newstest2009]

#raw-input =
#raw-reference =
input-sgm = $dev-dir/newstest2009-src.$input-extension.sgm
reference-sgm = $dev-dir/newstest2009-ref.$output-extension.sgm

[REPORTING]

### what to do with result (default: store in file evaluation/report)
#
# email = pkoehn@inf.ed.ac.uk
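#######################################################
## RUNNING THE EXPERIMENT
# Once the paths above are adjusted, the experiment described by this
# file is run with EMS. A minimal sketch, assuming this file is saved
# as config.toy inside $working-dir:
#
#   cd $working-dir
#   $moses-script-dir/ems/experiment.perl -config config.toy -exec
#
# Leaving out -exec makes experiment.perl do a dry run, showing the
# steps it would execute without running them.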