From 504280dee75c95161827dd1c54ccfe79718d68dd Mon Sep 17 00:00:00 2001 From: alexrmelnick Date: Sat, 30 Mar 2024 18:42:55 -0400 Subject: [PATCH] Pushing RegexParser.java into master --- src/main/java/RegexParser.java | 106 +++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 src/main/java/RegexParser.java diff --git a/src/main/java/RegexParser.java b/src/main/java/RegexParser.java new file mode 100644 index 0000000..7604c09 --- /dev/null +++ b/src/main/java/RegexParser.java @@ -0,0 +1,106 @@ +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class RegexParser { + + public static void main(String[] args) { + // Read URLs from a text file + List urls = readUrlsFromFile("urls.txt"); + List links = new ArrayList(); + List sentences = new ArrayList(); + + // Parse each webpage and gather information + for (String url : urls) { + try { + // Fetch the webpage using JSoup + Document doc = Jsoup.connect(url).get(); + + // Extract all the text content from the page + String allText = getAllText(doc); + + // Find all sentences in the text + sentences = findSentences(allText); + + // Extract links from the page + //links = extractLinks(doc); + + // Write sentences and links to a file + writeToFile(sentences, links, "output_" + getFilenameFromUrl(url) + ".txt"); + + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + // Function to read URLs from a text file + private static List readUrlsFromFile(String filename) { + List urls = new ArrayList<>(); + try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { + String line; + while ((line = reader.readLine()) != null) { + urls.add(line); + } + } catch (IOException e) { + e.printStackTrace(); + } + return urls; + } + + // Function to extract all text content from the page + private static String getAllText(Document doc) { + return doc.text(); + } + + // Function to find all sentences in the given text + private static List findSentences(String text) { + List sentences = new ArrayList<>(); + Pattern pattern = Pattern.compile("([^?.!]+[?.!])"); + Matcher matcher = pattern.matcher(text); + while (matcher.find()) { + sentences.add(matcher.group()); + } + return sentences; + } + + // Function to extract links from the page + private static List extractLinks(Document doc) { + List links = new ArrayList<>(); + Elements elements = doc.select("a[href]"); + for (Element element : elements) { + links.add(element.attr("href")); + } + return links; + } + + // Function to write sentences and links to a file + private static void writeToFile(List sentences, List links, String filename) { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) { + writer.write("Sentences found:\n"); + for (String sentence : sentences) { + writer.write(sentence + "\n"); + } + writer.write("\nLinks on the page:\n"); + for (String link : links) { + writer.write(link + "\n"); + } + System.out.println("Results written to " + filename); + } catch (IOException e) { + e.printStackTrace(); + } + } + + // Function to get a filename from the URL + private static String getFilenameFromUrl(String url) { + String[] parts = url.split("/"); + return parts[parts.length - 1]; + } +} \ No newline at end of file -- GitLab