// TextProcessor.java (1.29 KiB) - authored by Manuel Segimon.
// Note: repository web-page text (code-owners / approvers notice) was captured with this file; it is not part of the program.
package edu.bu.LanguageCorrection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Static helpers that split raw text into sentences and sentences into
 * word n-grams ("phrases").
 */
public class TextProcessor {
    /** One or more sentence terminators followed by whitespace or the end of the text. */
    private static final Pattern SENTENCE_BOUNDARY = Pattern.compile("[.!?]+(?:\\s+|\\z)");

    /** Any character that is neither an ASCII letter/digit nor whitespace. */
    private static final Pattern NON_WORD_CHARS = Pattern.compile("[^a-zA-Z0-9\\s]");

    /** A run of whitespace separating two words. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Splits {@code text} into sentences.
     *
     * <p>A sentence ends at a run of {@code .}, {@code !} or {@code ?} that is
     * followed by whitespace (space, tab, LF, CRLF, ...) or by the end of the
     * text. The terminators are dropped from every sentence, including the
     * last one, each sentence is trimmed, and empty fragments are skipped.
     * A dot inside a token (for example {@code 3.14}) does not end a sentence.
     *
     * @param text the text to split; {@code null} is treated as empty
     * @return a new mutable list of non-empty sentences in order of appearance;
     *         empty when {@code text} is {@code null} or has no content
     */
    public static List<String> extractSentences(String text) {
        List<String> sentences = new ArrayList<>();
        if (text == null) {
            return sentences;
        }
        for (String fragment : SENTENCE_BOUNDARY.split(text)) {
            String sentence = fragment.trim();
            // Skip blanks so empty input does not yield a phantom "" sentence.
            if (!sentence.isEmpty()) {
                sentences.add(sentence);
            }
        }
        return sentences;
    }

    /**
     * Extracts the distinct word n-grams of {@code sentence} for every
     * {@code n} in {@code [minN, maxN]}.
     *
     * <p>Characters other than ASCII letters, digits and whitespace are removed
     * first; the remainder is split into words on whitespace runs. Phrases are
     * words joined by a single space.
     *
     * @param sentence the sentence to process; {@code null} is treated as empty
     * @param minN smallest phrase length in words; values below 1 are treated as 1
     * @param maxN largest phrase length in words; values above the word count
     *             are treated as the word count
     * @return a new mutable list of distinct phrases, ordered by increasing
     *         {@code n} and then by position of first occurrence; empty when
     *         the sentence has no words or the range is empty
     */
    public static List<String> extractPhrases(String sentence, int minN, int maxN) {
        if (sentence == null) {
            return new ArrayList<>();
        }
        // Keep whitespace while stripping punctuation: removing tabs/newlines
        // here would glue neighbouring words together before the split.
        String cleaned = NON_WORD_CHARS.matcher(sentence).replaceAll("").trim();
        if (cleaned.isEmpty()) {
            // Splitting "" would produce one empty "word" and thus an empty phrase.
            return new ArrayList<>();
        }
        String[] words = WHITESPACE_RUN.split(cleaned);

        // Clamp the range: n < 1 would only add empty phrases, and an n larger
        // than the word count can never match (and maxN == Integer.MAX_VALUE
        // would otherwise overflow the loop counter and never terminate).
        int smallest = Math.max(1, minN);
        int largest = Math.min(maxN, words.length);

        // LinkedHashSet removes duplicates while keeping a deterministic order.
        Set<String> phraseSet = new LinkedHashSet<>();
        for (int n = smallest; n <= largest; n++) {
            for (int start = 0; start + n <= words.length; start++) {
                StringBuilder phrase = new StringBuilder(words[start]);
                for (int k = start + 1; k < start + n; k++) {
                    phrase.append(' ').append(words[k]);
                }
                phraseSet.add(phrase.toString());
            }
        }
        // Convert back to a list to keep the original return type.
        return new ArrayList<>(phraseSet);
    }
}