################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

### You definitely need to edit the lines starting with the following:
# username, home-dir, working-dir, input-extension, output-extension,
# pair-extension, moses-src-dir, external-bin-dir

[GENERAL]

input-extension = en
output-extension = de
pair-extension = de-en

### Your username
username = myusername

### On Cygwin or Linux, it is /home/$username
### On Mac OSX, it is /Users/$username
home-dir = /home/$username

### directory in which the experiment is run
working-dir = $home-dir/corpora
data-dir = $working-dir
train-dir = $data-dir
dev-dir = $data-dir/dev

# Moses paths
moses-src-dir = $home-dir/moses
moses-script-dir = $moses-src-dir/scripts
moses-bin-dir = $moses-src-dir/bin
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"
decoder = $moses-bin-dir/moses

# directory where the GIZA++/MGIZA programs reside
external-bin-dir = $moses-src-dir/mgiza/mgizapp/bin

input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -l $output-extension"
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

### multi-core settings
# when the generic parallelizer is used, it runs on
# the number of cores specified here
cores = 3

srilm-dir = $moses-src-dir/
irstlm-dir = $moses-src-dir/
randlm-dir = $moses-src-dir/

#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

max-sentence-length = 80

### command to run to get raw corpus files
#
# get-corpus-script =

[CORPUS:mycorpus]

raw-stem = $train-dir/news-commentary-v10.$pair-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
settings = "--prune '0 0 1' -T $working-dir/lm -S 40%"
order = 4
type = 8
lm-binarizer = $moses-bin-dir/build_binary

### each language model to be used has its own section here

[LM:mycorpus]

raw-corpus = $train-dir/news-commentary-v10.$pair-extension.$output-extension
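### For reference, the kenlm training step configured above corresponds
# roughly to running lmplz and build_binary by hand. This is only a
# hedged sketch: EMS drives these tools through the wrapper above, and
# the corpus/output file names below are placeholders, not files that
# EMS creates under these names.
#
#   $moses-bin-dir/lmplz -o 4 -S 40% -T $working-dir/lm --prune 0 0 1 \
#       < corpus.tok.$output-extension > lm.$output-extension.arpa
#   $moses-bin-dir/build_binary lm.$output-extension.arpa lm.$output-extension.binlm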
#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM]

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance, [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
#script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
#tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)
#
# irstlm
#lm-binarizer = $irstlm-dir/compile-lm
# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

# If everything else will be parallel, you'll want to set mgiza-cpus to
# half of the number of your CPUs, since we're aligning in both directions
training-options = "-parallel -cores 3 -sort-buffer-size 1G -sort-compress gzip -sort-parallel 3 -mgiza -mgiza-cpus 2"

### factored training: specify here which factors are used
# if none specified, single-factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
#run-giza-in-parts = 5

### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
#alignment-symmetrization-method = berkeley
alignment-symmetrization-method = grow-diag-final-and

### use of Chris Dyer's fast_align for word alignment
#
#fast-align-settings = "-d -o -v"

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 4"
#berkeley-process-options = "-EMWordAligner.numThreads 4"
#berkeley-posterior = 0.5

### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
#  $working-dir/training/prepared.$baseline/$output-extension.vcb \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
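### For illustration only: a word alignment file like the one pointed to
# above contains one line per sentence pair, made up of whitespace-separated
# source-target token index pairs (0-based), e.g.
#
#   0-0 1-1 2-3 3-2 4-4
#
# (the indices here are invented; the actual files are produced by the
# alignment and symmetrization steps)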
### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and the ratio to be kept,
# either before or after word alignment
#
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"

### build memory-mapped suffix array phrase table
# (binarizing the reordering table is a good idea, since filtering makes little sense)
#
#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
#binarize-all = $moses-script-dir/training/binarize-model.perl

### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor

## Operation Sequence Model (OSM)
# Durrani, Schmid and Fraser (2011):
# "A Joint Sequence Translation Model with Integrated Reordering"
# compile Moses with --max-kenlm-order=9 if a higher order is required
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'"
#
# if OSM training should be skipped, point to the OSM model
#osm-model =

### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL 2014):
# "Integrating an Unsupervised Transliteration Model
#  into Statistical Machine Translation"
#
#transliteration-module = "yes"
#post-decoding-transliteration = "yes"

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### if word alignment (giza symmetrization) should be skipped,
# point to word alignment files
#
#word-alignment =

### settings for rule extraction
#
#extract-settings = ""
max-phrase-length = 5

### if phrase extraction should be skipped,
# point to the stem for extract files
#
#extracted-phrases =

### settings for rule scoring
#
score-settings = "--GoodTuring"

### if phrase table training should be skipped,
# point to the phrase translation table
#
#phrase-translation-table =

### if reordering table training should be skipped,
# point to the reordering table
#
#reordering-table =

### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =

#######################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir --threads=4"

### specify the corpus used for tuning
# it should contain hundreds, if not thousands, of sentences
#
#raw-input =
#raw-reference =
input-sgm = $dev-dir/news-test2008-src.$input-extension.sgm
reference-sgm = $dev-dir/news-test2008-ref.$output-extension.sgm

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =

### additional flags for the decoder
#
decoder-settings = "-threads 3"

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
raw-stem = CORPUS:raw-stem

### trained model
#
#truecase-model =
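### For reference, the truecaser steps configured here correspond roughly
# to the following manual commands. This is a sketch only; the model and
# corpus file names are placeholders, not the names EMS uses internally.
#
#   $moses-script-dir/recaser/train-truecaser.perl \
#       --model truecase-model.$output-extension --corpus corpus.tok.$output-extension
#   $moses-script-dir/recaser/truecase.perl \
#       --model truecase-model.$output-extension < input.tok > input.tc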
##################################
## EVALUATION: score system output

[EVALUATION]

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"

### additional decoder settings
# switches for the Moses decoder
# common choices:
#   "-threads N" for multi-threading
#   "-mbr" for MBR decoding
#   "-drop-unknown" for dropping unknown source words
#   "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
# decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
decoder-settings = "-threads 4"

### should output be scored case-sensitive (default: no)?
#
# case-sensitive = yes

### BLEU
#
multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
# ibm-bleu =

### TER: translation error rate (BBN metric) based on edit distance
#
# ter = $edinburgh-script-dir/tercom_v6a.pl

### METEOR: gives credit to stem / wordnet synonym matches
#
# meteor =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
#
# visualization of the search graph in tree-based models
#analyze-search-graph = yes

[EVALUATION:newstest2009]

#raw-input =
#raw-reference =
input-sgm = $dev-dir/newstest2009-src.$input-extension.sgm
reference-sgm = $dev-dir/newstest2009-ref.$output-extension.sgm

[REPORTING]

### what to do with result (default: store in file evaluation/report)
#
# email = pkoehn@inf.ed.ac.uk
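#######################################################
## RUNNING THE EXPERIMENT
# Once the paths above are adjusted, the experiment described by this
# file is run with EMS. A minimal sketch, assuming this file is saved
# as config.toy inside $working-dir:
#
#   cd $working-dir
#   $moses-script-dir/ems/experiment.perl -config config.toy -exec
#
# Leaving out -exec makes experiment.perl do a dry run, showing the
# steps it would execute without running them.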