Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
TextProcessor.java 1.29 KiB
package edu.bu.LanguageCorrection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Static helpers that split raw text into sentences and sentences into word n-grams.
 */
public class TextProcessor {
    /** A run of sentence-ending punctuation followed by any whitespace (space, tab, CR, LF, ...). */
    private static final Pattern SENTENCE_BOUNDARY = Pattern.compile("[.!?]+\\s+");

    /** Everything that is neither an ASCII letter/digit nor whitespace. */
    private static final Pattern NON_WORD_CHARS = Pattern.compile("[^a-zA-Z0-9\\s]");

    /** One or more whitespace characters; used to tokenize a cleaned sentence. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Splits {@code text} into sentences at every run of {@code .}, {@code !} or {@code ?}
     * that is followed by whitespace.
     *
     * <p>The punctuation and whitespace forming a boundary are removed. Punctuation that is not
     * followed by whitespace (typically the terminator of the last sentence) is kept as part of
     * the sentence. Each sentence is trimmed, and empty pieces are dropped.
     *
     * @param text the text to split; may be {@code null} or empty
     * @return a new mutable list of sentences in their original order; empty when {@code text}
     *         is {@code null} or contains no non-whitespace characters
     */
    public static List<String> extractSentences(String text) {
        List<String> sentences = new ArrayList<>();
        if (text == null) {
            return sentences;
        }
        for (String piece : SENTENCE_BOUNDARY.split(text)) {
            String sentence = piece.trim();
            if (!sentence.isEmpty()) {
                sentences.add(sentence);
            }
        }
        return sentences;
    }

    /**
     * Extracts every distinct word n-gram of {@code sentence} for each n in
     * {@code [minN, maxN]}.
     *
     * <p>Characters other than ASCII letters, digits and whitespace are removed first; the
     * remainder is split into words on whitespace. Values of {@code minN} below 1 are treated
     * as 1, and values of {@code maxN} above the word count are treated as the word count.
     *
     * @param sentence the sentence to analyze; may be {@code null} or empty
     * @param minN     smallest phrase length in words
     * @param maxN     largest phrase length in words
     * @return a new mutable list of distinct phrases, ordered by phrase length and then by
     *         first occurrence; empty when there are no words or the range is empty
     */
    public static List<String> extractPhrases(String sentence, int minN, int maxN) {
        if (sentence == null) {
            return new ArrayList<>();
        }
        // Keep whitespace while stripping punctuation so that tabs/newlines still separate
        // words, then trim so the split below cannot produce a leading empty token.
        String cleaned = NON_WORD_CHARS.matcher(sentence).replaceAll("").trim();
        if (cleaned.isEmpty()) {
            return new ArrayList<>();
        }
        String[] words = WHITESPACE.split(cleaned);

        // Insertion-ordered set: removes duplicate phrases but keeps the result deterministic.
        Set<String> phraseSet = new LinkedHashSet<>();

        // Clamp the range: n < 1 would only yield empty phrases, and n > words.length
        // cannot yield any phrase at all.
        int smallest = Math.max(1, minN);
        int largest = Math.min(maxN, words.length);

        for (int n = smallest; n <= largest; n++) {
            for (int start = 0; start + n <= words.length; start++) {
                StringBuilder phrase = new StringBuilder(words[start]);
                for (int k = start + 1; k < start + n; k++) {
                    phrase.append(' ').append(words[k]);
                }
                phraseSet.add(phrase.toString());
            }
        }

        return new ArrayList<>(phraseSet);
    }
}