Commit b420daf8 authored by Manuel Segimon
Browse files

work in progress

parent 580a5f7b
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@ public class Checker {
        Map<String, Float> phraseScores = new HashMap<>();

        for (String sentence : sentences) {
            sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
            //System.out.println("Analyzing sentence: " + sentence);
            List<String> phrases = TextProcessor.extractPhrases(sentence, 2, 3);

@@ -27,7 +28,7 @@ public class Checker {
            for (String phrase : phrases) {
                // System.out.println("Analyzing phrase: " + phrase);
                float perplexity = detector.perplexity(phrase);
                if (perplexity > 100) {
                if (perplexity < 0) {
                    phraseScores.put(phrase, 100f);
                } else {
                    phraseScores.put(phrase, perplexity);
+6 −2
Original line number Diff line number Diff line
@@ -90,8 +90,12 @@ public class Corrector {
            try {
                String content = new String(Files.readAllBytes(Paths.get(path)));
                Corrector corrector = new Corrector(); // Run corrector
                String corrected = corrector.correct(content);
                String[] sentences = TextProcessor.extractSentences(content).toArray(new String[0]);
                for (String sentence : sentences) {
                    sentence = sentence.replaceAll("[^a-zA-Z0-9\\s]", "");
                    String corrected = corrector.correct(sentence);
                    System.out.println(corrected);
                }
            } catch (IOException e) {
                System.err.println("Error reading file: " + e.getMessage());
            }
+18 −42
Original line number Diff line number Diff line
@@ -22,66 +22,42 @@ public class TrieNode implements Serializable, Cloneable {
    }

    /**
     * Estimates the probability of a space-separated phrase by walking the trie from this node,
     * one child lookup per word.
     *
     * NOTE(review): this span is a rendered diff hunk without +/- markers, so lines from both
     * sides of the change appear together (the two {@code for} headers and the two consecutive
     * {@code return} statements in the miss branch). Presumably only one line of each pair
     * exists in the real file -- confirm against the checked-in source.
     *
     * @param phrase words separated by single spaces
     * @return 0.1 when the phrase splits into one token or fewer; otherwise the last matched
     *         node's count divided by its parent's childCounts, or the miss-branch result when
     *         some word has no matching child
     */
    public float probability(String phrase) {
        String[] words = phrase.split(" ");
        // The check is on the token count only: a phrase of one token (or none) gets a fixed
        // 0.1 without any trie lookup.
        if (words.length <= 1) { // Token-count guard; the trie is not consulted here
            // System.out.println("Probability of phrase: " + 1 / this.childCounts);
            return (float) 0.1;
        }
        TrieNode current = this;
        TrieNode past = this;
        // Loop header that re-splits the phrase; see the diff note above regarding the second
        // header two lines below.
        for (String word : phrase.split(" ")) {
        // Prints the phrase to stdout on every call that reaches this point, including
        // recursive calls made from the miss branch.
        System.out.println("Phrase: " + phrase);
        for (String word : words) {
            // Remember the parent so the final ratio can use its childCounts.
            past = current;
            current = current.children.get(word);
            if (current == null) {
                // Miss: alpha is a constant weight of 1.
                float alpha = (float) 1;
                // System.out.println("Phrase not found in trie.");
                // Two alternative results follow: a flat 0, or alpha times the probability of
                // the phrase with everything up to and including the first space removed.
                return 0;
                return alpha * probability(phrase.substring(phrase.indexOf(" ") + 1));
            }
        }
        // System.out.println("Probability of phrase: " + (double) current.count /
        // past.childCounts);
        // System.out.println("Probability of phrase: " + (float) current.count / past.childCounts);
        // The cast binds to current.count, so the division is carried out in floating point.
        return (float) current.count / past.childCounts;
    }

    // NOTE(review): rendered diff hunk without +/- markers -- two variants of
    // getAverageChildCount are interleaved below and share a single closing brace.
    // Presumably only one of them exists in the real file; confirm against the source.
    //
    // Zero-argument variant: returns 1 when this node has no children, otherwise this node's
    // childCounts divided by the number of children, as a float.
    private float getAverageChildCount() {
        if (this.children.size() == 0) {
            return 1;
        }
        return (float) this.childCounts / this.children.size();
    // One-argument variant: the same ratio computed for the given node, with no guard for a
    // node whose children collection is empty (the divisor would then be 0 -- TODO confirm
    // callers never pass a leaf).
    private float getAverageChildCount(TrieNode node) {
        return (float) node.childCounts / node.children.size();
    }

    /**
     * Scores a space-separated phrase by walking the trie from this node and accumulating a
     * log ratio per matched word; the result is {@code 2^(-logProb)}.
     *
     * NOTE(review): this method sits inside a rendered diff hunk without +/- markers, so some
     * of the lines below may belong to only one side of the change -- confirm against the
     * checked-in source.
     *
     * @param phrase words separated by single spaces
     * @return 100 for a phrase that splits into exactly one token; on a trie miss,
     *         {@code 100 / words.length} plus the two-argument perplexity of the phrase with
     *         its leading word removed; otherwise {@code 2^(-logProb)}
     */
    public float perplexity(String phrase) {
        TrieNode current = this;
        TrieNode past = this;
        float logProb = 0;
        String[] words = phrase.split(" ");
        // words.length is 1 in this branch, so the value returned is 100.
        if (words.length == 1) {
            return (float) 100 / words.length;
        }
        // Not read anywhere between here and the end of this method.
        String currentPhrase = "";
        for (String word : words) {
            // Keep the parent node; its average child count is the denominator below.
            past = current;
            current = current.children.get(word);
            if (current == null) {
                // Miss: add a fixed penalty of 100 / word count and retry on the phrase with
                // the first occurrence of "<first word> " removed. replaceFirst interprets its
                // first argument as a regular expression.
                float alpha = (float) 100 / words.length;
                return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), words.length);
            }
            // Math.log is the natural logarithm, while the final step below raises 2 to the
            // negated sum. NOTE(review): the bases differ -- confirm this is intended.
            logProb += Math.log((float) current.count / past.getAverageChildCount());
        }
        float perplexity = (float) Math.pow(2, -logProb);
        //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);
        return perplexity;
    }

    private float perplexity(String phrase, int wordCount) {
        TrieNode current = this;
        TrieNode past = this;
        float logProb = 0;
        String[] words = phrase.split(" ");
        if (words.length == 1) {
            return (float) 100 / wordCount;
        }
        for (String word : words) {
            past = current;
            current = current.children.get(word);
            if (current == null) {
                float alpha = (float) 100 / wordCount;
                return alpha + perplexity(phrase.replaceFirst(words[0] + " ", ""), wordCount);
            if (currentPhrase.length() == 0) {
                currentPhrase = word;
            } else {
                currentPhrase += " " + word;
            }
            logProb += Math.log((float) current.count / past.getAverageChildCount());
            logProb += Math.log((probability(currentPhrase)));
        }
        float perplexity = (float) Math.pow(2, -logProb);
        //System.out.println("Perplexity of phrase (" + phrase + ") : " + perplexity);