System.out.println("Previous compressed data size: "+compression_size+" bytes. Current compressed data size: "+compressedData.length+" bytes. Delta:"+(compressedData.length-compression_size)+" bytes.");
System.out.println("Size limit exceeded. Reverting to previous chunk.");
sizeLimitExceeded=true;
@@ -146,6 +182,13 @@ public class crawler {
break;// Stop processing further chunks
}
}
}else{
extractWordUsage(web_data.text(),wordUsage);
// System.out.println("Ngrams built successfully. for size:"+MAXNGRAM);
uncompressedData=wordUsage.serialize();
compressedData=compress(uncompressedData);
build_off_corpus=false;// If there are URLs to read, it should still be able to read them.
}
// Save the uncompressed and compressed data to separate files
if(debug){
@@ -183,7 +226,7 @@ public class crawler {
// Split text into sentences
String[]sentences=text.split("[.!?] ");
for(Stringsentence:sentences){
for(intnGram=1;nGram<=3;nGram++){
for(intnGram=1;nGram<=MAXNGRAM;nGram++){
String[]words=sentence.split("\\s+");
for(inti=0;i<words.length-nGram+1;i++){
trie.insert(Arrays.copyOfRange(words,i,i+nGram));
@@ -244,6 +287,7 @@ public class crawler {
returntrie;
}catch(IOExceptione){
System.err.println("Error reading metadata from file: "+e.getMessage());