Commit 73343d3e authored by Moises Bensadon's avatar Moises Bensadon
Browse files

create large trie based on brown corpus by running ./crawler --build

parent 9f304279
Loading
Loading
Loading
Loading
+76 −32
Original line number Diff line number Diff line
@@ -30,8 +30,16 @@ public class crawler {

        // Open the file and read all lines (URLs) into the queue
        String file_url = "";
        if (args.length != 0 && args[0].equals("--file")) file_url = args[1];
        if (args.length > 2 && args[2].equals("--debug")) web_crawler.debug = true;
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("--file")) {
                file_url = args[i + 1];
            } else if (args[i].equals("--debug")) {
                web_crawler.debug = true;
            } else if (args[i].equals("--build")) {
                web_crawler.build_off_corpus = true;
            }
        } 
        if (!web_crawler.build_off_corpus) {
            try (FileReader f_read = new FileReader(file_url);
                BufferedReader buf_read = new BufferedReader(f_read)) {
                String url_line;
@@ -39,9 +47,10 @@ public class crawler {
                    web_crawler.add_to_queue(url_line);
                }
            }
        } 

        // Start crawling
        int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        final int crawlLimit = 100; // Adjustable limit (SET TO 1 FOR EASE OF USE)
        web_crawler.crawl(crawlLimit);
        
        // Print visited URLs
@@ -54,7 +63,9 @@ public class crawler {
    private String filePath = "metadata.ser";
    private TrieNode wordUsage = new TrieNode();
    private int compression_size = 0;
    private boolean debug = false;
    private boolean debug = false; // flag that outputs uncompressed json showing the trie
    private boolean build_off_corpus = false;
    private static final int MAXNGRAM = 3;

    public crawler() {
        url_queue = new LinkedList<>();
@@ -72,6 +83,10 @@ public class crawler {

    public void crawl(int maxPages) {
        int pageCount = 0;
        if (build_off_corpus) {
            System.out.println("Building off corpus...");
            processPage(get_file_text("brown.txt")); // TODO: extend to multiple files i.e. one per language
        }
        while (!url_queue.isEmpty() && pageCount < maxPages) {
            String cur_site = url_queue.poll();
            if (cur_site == null || visited_urls.contains(cur_site)) {
@@ -116,12 +131,33 @@ public class crawler {
        }
    }

    private Document get_file_text(String filename) {
        // Reads the named file from src/main/java/resources/ and returns its
        // contents wrapped in a Jsoup Document titled with the filename.
        // Returns null when the file cannot be read (error is logged).
        try {
            Document doc = new Document("");
            doc.title(filename);
            // NOTE(review): FileReader uses the platform default charset here —
            // consider an explicit UTF-8 reader; confirm corpus encoding.
            try (BufferedReader reader = new BufferedReader(new FileReader("src/main/java/resources/" + filename))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the terminator; re-append a newline so the
                    // last word of one line is not fused with the first word of the
                    // next (fused tokens would corrupt the whitespace-split n-grams).
                    doc.append(line + "\n");
                }
                System.out.println("File read successfully.");
            }
            return doc;
        } catch (IOException e) {
            System.err.println("Error reading file: " + e.getMessage());
            return null;
        }
    }

    private void processPage(Document web_data) {
        byte[] compressedData = new byte[0];
        byte[] uncompressedData = new byte[0];
        boolean sizeLimitExceeded = false;
        byte[] previousUncompressedData = wordUsage.serialize();

        if(!build_off_corpus) {
            // Break the page text into manageable chunks, considering sentences
            List<String> chunks = splitTextIntoChunks(web_data.text());
            // int chunkCount = 0;
@@ -137,7 +173,7 @@ public class crawler {
                uncompressedData = wordUsage.serialize();
                compressedData = compress(uncompressedData);

            if (compressedData.length - compression_size > 1024) {
                if ((compressedData.length - compression_size > 1024) ) {
                    System.out.println("Previous compressed data size: " + compression_size + " bytes. Current compressed data size: " + compressedData.length + " bytes. Delta:"+ (compressedData.length - compression_size) + " bytes.");
                    System.out.println("Size limit exceeded. Reverting to previous chunk.");
                    sizeLimitExceeded = true;
@@ -146,6 +182,13 @@ public class crawler {
                    break; // Stop processing further chunks
                }
            }
        } else {
            extractWordUsage(web_data.text(), wordUsage);
            // System.out.println("Ngrams built successfully. for size:"+MAXNGRAM);
            uncompressedData = wordUsage.serialize();
            compressedData = compress(uncompressedData);
            build_off_corpus = false; // if there are urls to read it should still be able to read them
        }

        // Save the uncompressed and compressed data to separate files
        if (debug) {
@@ -183,7 +226,7 @@ public class crawler {
        // Split text into sentences
        String[] sentences = text.split("[.!?] ");
        for (String sentence : sentences) {
            for (int nGram = 1; nGram <= 3; nGram++) {
            for (int nGram = 1; nGram <= MAXNGRAM; nGram++) {
                String[] words = sentence.split("\\s+");
                for (int i = 0; i < words.length - nGram + 1; i++) {
                    trie.insert(Arrays.copyOfRange(words, i, i + nGram));
@@ -244,6 +287,7 @@ public class crawler {
            return trie;
        } catch (IOException e) {
            System.err.println("Error reading metadata from file: " + e.getMessage());
            System.err.println("Creating new trie...");
            return new TrieNode();
        }
    }

src/main/java/resources/test.txt

deleted 100644 → 0
+0 −1
Original line number Diff line number Diff line
This is strange so choice word.
 No newline at end of file