/*
 * 17/06/2009, 16:30.
 *
 * Simuquiz - http://www.simuquiz.com.br
 */
package br.com.simuquiz.antispam;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * @author Thiago Henrique Coraini
 * @author Israel Lacerra
 * @author Victor Williams Stafusa da Silva
 * @author Pedro Lopes de Souza
 */
public class DocumentReader {

    /**
     * Separator between features: one or more non-word characters. Compiled
     * once instead of on every line.
     *
     * NOTE(review): without the UNICODE_CHARACTER_CLASS flag, {@code \W}
     * treats only {@code [a-zA-Z0-9_]} as word characters, so accented
     * letters act as separators. Left unchanged on purpose: altering the
     * tokenization would change the features produced for existing documents.
     */
    private static final Pattern FEATURE_SEPARATOR = Pattern.compile("\\W+");

    /** Tokens shorter than this many characters are not kept as features. */
    private static final int MIN_FEATURE_LENGTH = 3;

    /**
     * Extracts the set of distinct features (lower-cased words) of a document.
     *
     * The stream is read line by line until the end of the input. Each line
     * is lower-cased and split on runs of non-word characters; every
     * resulting token with at least {@link #MIN_FEATURE_LENGTH} characters is
     * added to the result.
     *
     * Bytes are decoded with the platform default charset (the behavior of
     * {@link Scanner#Scanner(InputStream)}). The stream is not closed by this
     * method. An I/O error while reading is treated by {@link Scanner} as the
     * end of the input, so the features collected so far are returned.
     *
     * @param is the document to read; must not be {@code null}
     * @return a new, mutable set with the features found; empty if there are none
     * @throws NullPointerException if {@code is} is {@code null}
     */
    public static Set<String> getDocFeatures(InputStream is) {
        Scanner scanner = new Scanner(is);
        Set<String> features = new HashSet<String>();

        // hasNextLine() is the guard that matches nextLine(); hasNext() tests
        // for a token rather than for a line.
        while (scanner.hasNextLine()) {
            // Lower-case with a fixed locale: with the default locale, a
            // Turkish or Azeri JVM maps 'I' to the dotless 'ı', which is not an
            // ASCII word character and would cut words such as "PRIZE" apart.
            String line = scanner.nextLine().toLowerCase(Locale.ENGLISH);
            addAll(features, FEATURE_SEPARATOR.split(line));
        }
        return features;
    }

    /**
     * Adds to {@code features} every token that is long enough to be kept.
     *
     * @param features     the set that receives the accepted tokens
     * @param lineFeatures the candidate tokens of one line; may contain empty strings
     */
    private static void addAll(Set<String> features, String[] lineFeatures) {
        for (String feature : lineFeatures) {
            if (feature.length() >= MIN_FEATURE_LENGTH) {
                features.add(feature);
            }
        }
    }
}
