Friday, July 20, 2012


Command-line commands for K-means clustering with Mahout

./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8 -chunk 5

./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse -ng 2 -nv

./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -x 10 -k 2 -ow -cl

./bin/mahout clusterdump -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0  -dt sequencefile -s ./examples/bin/work/reuters-kmeans/clusters-3/part-r-00000 -n 20  -b 100 -p ./examples/bin/work/reuters-kmeans/clusteredPoints

./bin/mahout seqdumper -s ./examples/bin/work/reuters-kmeans/clusteredPoints/part-m-00000  | more

./bin/mahout rowid -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/part-r-00000  -o ./examples/bin/work/reuters-matrix

./bin/mahout rowid -Dmapred.input.dir=/home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/part-r-00000 -Dmapred.output.dir=/home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-matrix

The matrix produced by rowid has 4 rows and 1073 columns (1073 is the value passed as -r to rowsimilarity below).

./bin/mahout seqdumper -s /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-matrix/matrix | more

./bin/mahout rowsimilarity -i /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-matrix/matrix \
  -o /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-named-similarity \
  -r 1073 \
  --similarityClassname SIMILARITY_COSINE -m 10

./bin/mahout rowsimilarity -i /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-matrix/matrix  -o /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-named-similarity -r 1073  --similarityClassname SIMILARITY_COOCCURRENCE -m 10 --tempDir /home/venkat/Desktop/tmp

Values listed for --similarityClassname:

SIMILARITY_COOCCURRENCE,
SIMILARITY_EUCLIDEAN_DISTANCE,
SIMILARITY_LOGLIKELIHOOD,
SIMILARITY_PEARSON_CORRELATION,
SIMILARITY_TANIMOTO_COEFFICIENT,
SIMILARITY_UNCENTERED_COSINE,
SIMILARITY_UNCENTERED_ZERO_ASSUMING_COSINE,
SIMILARITY_CITY_BLOCK

./bin/mahout seqdumper -s /home/venkat/Desktop/mahout/mahout-distribution-0.5/examples/bin/work/reuters-matrix/docIndex

Canopy clustering

./bin/mahout canopy -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/  -o ./examples/bin/work/canopy-output  -t1 3.0 -t2 2.8  -t3 3.0  -t4 2.8 -ow  -xm sequential

./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ -c ./examples/bin/work/canopy-output -o ./examples/bin/work/reuters-kmeans -x 10 -k 3 -ow
./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ -c ./examples/bin/work/canopy-output -o ./examples/bin/work/reuters-kmeans -x 10  -ow


./bin/mahout seqdumper -s ./examples/bin/work/reuters-kmeans/clusters-1/part-r-00000


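Canopy command with every option spelled out (-dm takes the distance measure's fully qualified class name; -cl is a flag that runs input vector clustering after computing Canopies):

bin/mahout canopy \
    -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ \
    -o ./examples/bin/work/reuters-out-seqdir-sparse \
    -dm org.apache.mahout.common.distance.ManhattanDistanceMeasure \
    -t1 3.0 \
    -t2 2.8 \
    -t3 3.0 \
    -t4 2.8 \
    -ow \
    -cl \
    -xm sequential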



Usage (./bin/mahout seq2sparse):
 [--minSupport <minSupport> --analyzerName <analyzerName> --chunkSize          
<chunkSize> --output <output> --input <input> --minDF <minDF> --maxDFPercent  
<maxDFPercent> --weight <weight> --norm <norm> --minLLR <minLLR> --numReducers
<numReducers> --maxNGramSize <ngramSize> --overwrite --help                    
--sequentialAccessVector --namedVector --logNormalize]                        
Options                                                                        
  --minSupport (-s) minSupport        (Optional) Minimum Support. Default      
                                      Value: 2                                
  --analyzerName (-a) analyzerName    The class name of the analyzer          
  --chunkSize (-chunk) chunkSize      The chunkSize in MegaBytes. 100-10000 MB
  --output (-o) output                The output directory                    
  --input (-i) input                  input dir containing the documents in    
                                      sequence file format                    
  --minDF (-md) minDF                 The minimum document frequency.  Default
                                      is 1                                    
  --maxDFPercent (-x) maxDFPercent    The max percentage of docs for the DF.  
                                      Can be used to remove really high        
                                      frequency terms. Expressed as an integer
                                      between 0 and 100. Default is 99.        
  --weight (-wt) weight               The kind of weight to use. Currently TF  
                                      or TFIDF                                
  --norm (-n) norm                    The norm to use, expressed as either a  
                                      float or "INF" if you want to use the    
                                      Infinite norm.  Must be greater or equal
                                      to 0.  The default is not to normalize  
  --minLLR (-ml) minLLR               (Optional)The minimum Log Likelihood    
                                      Ratio(Float)  Default is 1.0            
  --numReducers (-nr) numReducers     (Optional) Number of reduce tasks.      
                                      Default Value: 1                        
  --maxNGramSize (-ng) ngramSize      (Optional) The maximum size of ngrams to
                                      create (2 = bigrams, 3 = trigrams, etc)  
                                      Default Value:1                          
  --overwrite (-ow)                   If set, overwrite the output directory  
  --help (-h)                         Print out help                          
  --sequentialAccessVector (-seq)     (Optional) Whether output vectors should
                                      be SequentialAccessVectors. If set true  
                                      else false                              
  --namedVector (-nv)                 (Optional) Whether output vectors should
                                      be NamedVectors. If set true else false  
  --logNormalize (-lnorm)             (Optional) Whether output vectors should
                                      be logNormalize. If set true else false  



No comments:

Post a Comment