# FILES

allwords.txt        -- all the words, enumerating words is based on this list
allwords_n.txt      -- all the words and their total counts in all documents
stop_words.txt      -- list of Finnish stopwords from 
  http://nettiapina.fi/blog/wp-content/uploads/2007/07/fi_stopwords.txt

bin/                -- python programs with which the files were preprocessed

libstemmer_c/       -- Snowball stemmer http://snowball.tartarus.org/
libstemmer_c.tgz    -- and its original tgz

# To illustrate the process, the directories below hold the documents
  at each preprocessing phase.
  The Python programs read from stdin and write to stdout.

raw/                -- Raw files downloaded with wget from 
                       http://www.pirkka.fi/ruoka/reseptihaku

txt/                -- Extra HTML stripped with bin/extex.py

wrd/                -- Words extracted from the docs with bin/words.py

stm/                -- Words stemmed with libstemmer_c/stemwords -l fi

stp/                -- Stop words removed with bin/stopsout.py stop_words.txt

num/                -- Words converted to numbers by bin/num.py allwords.txt

coc/                -- Co-occurrences formed with bin/coc.py


d2coc.sh            -- a program that counts word occurrences from the input.
                       This can be used to turn a query into counts.

                       For example:

                       echo Paistettua kalaa | ./d2coc.sh 

raw2coc.sh          -- This is the script that was used when turning the
                       raw documents into the co-occurrences.
