Commit 935cf3fa authored by Manuel Segimon
Browse files

Add support for Trie-based serialization and deserialization in TrieNode class

parent 40776f57
Loading
Loading
Loading
Loading
+42 −25
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ package edu.bu.LanguageCorrection;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Stack;

public class TrieNode implements Serializable {
    HashMap<String, TrieNode> children = new HashMap<>();
@@ -54,44 +55,60 @@ public class TrieNode implements Serializable {
    }
    
    /**
     * Serializes the trie rooted at this node to bytes.
     *
     * <p>The text produced by {@code serializeHelper(this)} is tidied by removing the
     * space that precedes each closing brace and trimming the ends, then encoded as
     * UTF-8 so the result does not depend on the platform default charset. Whatever
     * decodes these bytes must use UTF-8 as well.
     *
     * @return the encoded trie as UTF-8 bytes
     */
    public byte[] serialize() {
        // Literal replacement: no regex is needed to drop the space before each "}".
        String encoded = serializeHelper(this).replace(" }", "}").trim();
        return encoded.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    }

    /**
     * Encodes {@code node} and, recursively, its children as text.
     *
     * <p>A node is written as its {@code count}, a space, and its {@code childCounts}.
     * When {@code childCounts} is zero a single trailing space follows and nothing else
     * is written for that node. Otherwise an opening brace follows, then for every entry
     * of {@code children} the key, a space and the child's own encoding, and finally a
     * closing brace.
     *
     * <p>Keys are written verbatim with no escaping, so a key that contains whitespace
     * or a brace makes the output ambiguous. Entries are emitted in the map's iteration
     * order.
     *
     * @param node the node whose subtree is encoded
     * @return the textual encoding of the subtree rooted at {@code node}
     */
    private String serializeHelper(TrieNode node) {
        StringBuilder serialized = new StringBuilder();
        appendSerialized(node, serialized);
        return serialized.toString();
    }

    /**
     * Appends the encoding of {@code node}'s subtree to {@code out}; sharing one builder
     * across the recursion avoids re-copying every subtree at each level.
     */
    private static void appendSerialized(TrieNode node, StringBuilder out) {
        out.append(node.count).append(" ").append(node.childCounts);
        if (node.childCounts == 0) {
            // NOTE(review): the leaf test uses childCounts, not children.isEmpty(); any
            // entries in children are not written when childCounts is 0 - confirm the
            // two fields are always kept in step.
            out.append(" ");
            return;
        }
        out.append("{");
        for (java.util.Map.Entry<String, TrieNode> entry : node.children.entrySet()) {
            out.append(entry.getKey()).append(" ");
            appendSerialized(entry.getValue(), out);
        }
        out.append("}");
    }

    public TrieNode deserialize(byte[] serialized) {
        String serializedString = new String(serialized);
        String[] tokens = serializedString.split(" ");
        System.out.println("Deserialized metadata size: " + tokens.length + " tokens");
        // Create Root
        TrieNode root = new TrieNode(); 
        root.count = Integer.parseInt(tokens[0]);
        root.childCounts = Integer.parseInt(tokens[1]);
        for (int i = 2; i < tokens.length; i += 2) {
            root.children.put(tokens[i], deserializeHelper(tokens, i + 1));
        }
        return root;
    public void deserialize(byte[] data) {
        String serializedString = new String(data);
        // Ensure proper spacing around '{' and '}' to correctly parse children
        serializedString = serializedString.replaceAll("\\{", " { ").replaceAll("\\}", " } ").trim();
        String[] parts = serializedString.split("\\s+");
        int[] index = { 0 };

        // Clear current state before rebuilding
        this.children.clear();
        deserializeHelper(this, parts, index);
    }

    // Helper method adapted for in-place deserialization
    private static void deserializeHelper(TrieNode node, String[] parts, int[] index) {
        if (index[0] >= parts.length) {
            throw new IllegalArgumentException("Unexpected end of serialized data.");
        }

    private TrieNode deserializeHelper(String[] tokens, int index) {
        // Create Trie Node
        TrieNode node = new TrieNode();
        node.count = Integer.parseInt(tokens[index]);
        node.childCounts = Integer.parseInt(tokens[index + 1]);
        System.out.println("Index "+index+" Deserialized node: " + node.count + " " + node.childCounts+" token "+tokens[index+2]);
        for (int i = index + 2; i < tokens.length; i += 2) {
            node.children.put(tokens[i], deserializeHelper(tokens, i + 1));
        }
        return node;
        // Set the current node's count and childCounts
        node.count = Integer.parseInt(parts[index[0]++]);
        node.childCounts = Integer.parseInt(parts[index[0]++]);

        if ("{".equals(parts[index[0]])) {
            index[0]++; // Move past the "{" marker
            while (!"}".equals(parts[index[0]])) {
                String key = parts[index[0]++];
                TrieNode child = new TrieNode();
                deserializeHelper(child, parts, index);
                node.children.put(key, child);
            }
            index[0]++; // Skip the "}" marker
        }
    }
}
+3 −2
Original line number Diff line number Diff line
@@ -113,6 +113,7 @@ public class crawler {
        System.out.println(web_data.text());
        */
        // Extract word usage data
        //System.err.println(web_data.text());
        extractWordUsage(web_data.text(), wordUsage);
        byte[] metadata = wordUsage.serialize();
        System.out.println("Metadata extracted successfully.");
@@ -140,7 +141,7 @@ public class crawler {
        // Split text into sentences
        String[] sentences = text.split("[.!?] ");
        for (String sentence : sentences) {
            for (int nGram = 1; nGram <= 2; nGram++) {
            for (int nGram = 1; nGram <= 3; nGram++) {
                String[] words = sentence.split("\\s+");
                for (int i = 0; i < words.length - nGram + 1; i++) {
                    trie.insert(Arrays.copyOfRange(words, i, i + nGram));