Loading testcases/download_molecules.py +19 −15 Original line number Diff line number Diff line Loading @@ -6,9 +6,11 @@ import time MIN_ISOMER_COUNT = 100 MAX_CID_SEGMENT_LEN = 4000 # maybe 3997 MAX_MOLECULE_COUNT = 1 MAX_MOLECULE_COUNT = 10_000_000 C_count_max = 10000 TITIN = "C169719H270466N45688O52238S911" C_count_max = 100 C_count_min = C_count_max // 2 H_C_ratio_max = 48 / 25 Loading Loading @@ -47,18 +49,20 @@ if __name__ == "__main__": molecule_count = 0 while molecule_count < MAX_MOLECULE_COUNT: chemical_formula = "" while chemical_formula in previous_formulae: chemical_formula = "" molecular_formula = "" while molecular_formula in previous_formulae: molecular_formula = "" C_count = random.randint(C_count_min, C_count_max) chemical_formula += "C" + str(C_count) chemical_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) chemical_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) chemical_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) molecular_formula += "C" + str(C_count) molecular_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) molecular_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) molecular_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) print(molecular_formula) url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/" + chemical_formula + molecular_formula + "/cids/JSON" ) previous_time = wait_for_api_ready(previous_time) Loading @@ -69,15 +73,15 @@ if __name__ == "__main__": data = response.json() if "Fault" in data: print(chemical_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(chemical_formula) print(molecular_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(molecular_formula) continue cids = data["IdentifierList"]["CID"] isomer_count = len(cids) if isomer_count < MIN_ISOMER_COUNT: # print(chemical_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(chemical_formula) # print(molecular_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(molecular_formula) continue cid_segment = str(cids[0]) Loading @@ -101,7 +105,7 @@ if __name__ == "__main__": for molecule_struct in data["PropertyTable"]["Properties"]: if "CanonicalSMILES" in molecule_struct: molecule_name = ( chemical_formula + "_" + str(molecule_struct["CID"]) molecular_formula + "_" + str(molecule_struct["CID"]) ) smiles = molecule_struct["CanonicalSMILES"] if ( Loading Loading
testcases/download_molecules.py +19 −15 Original line number Diff line number Diff line Loading @@ -6,9 +6,11 @@ import time MIN_ISOMER_COUNT = 100 MAX_CID_SEGMENT_LEN = 4000 # maybe 3997 MAX_MOLECULE_COUNT = 1 MAX_MOLECULE_COUNT = 10_000_000 C_count_max = 10000 TITIN = "C169719H270466N45688O52238S911" C_count_max = 100 C_count_min = C_count_max // 2 H_C_ratio_max = 48 / 25 Loading Loading @@ -47,18 +49,20 @@ if __name__ == "__main__": molecule_count = 0 while molecule_count < MAX_MOLECULE_COUNT: chemical_formula = "" while chemical_formula in previous_formulae: chemical_formula = "" molecular_formula = "" while molecular_formula in previous_formulae: molecular_formula = "" C_count = random.randint(C_count_min, C_count_max) chemical_formula += "C" + str(C_count) chemical_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) chemical_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) chemical_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) molecular_formula += "C" + str(C_count) molecular_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) molecular_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) molecular_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) print(molecular_formula) url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/" + chemical_formula + molecular_formula + "/cids/JSON" ) previous_time = wait_for_api_ready(previous_time) Loading @@ -69,15 +73,15 @@ if __name__ == "__main__": data = response.json() if "Fault" in data: print(chemical_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(chemical_formula) print(molecular_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(molecular_formula) continue cids = data["IdentifierList"]["CID"] isomer_count = len(cids) if isomer_count < MIN_ISOMER_COUNT: # print(chemical_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(chemical_formula) # print(molecular_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(molecular_formula) continue cid_segment = str(cids[0]) Loading @@ -101,7 +105,7 @@ if __name__ == "__main__": for molecule_struct in data["PropertyTable"]["Properties"]: if "CanonicalSMILES" in molecule_struct: molecule_name = ( chemical_formula + "_" + str(molecule_struct["CID"]) molecular_formula + "_" + str(molecule_struct["CID"]) ) smiles = molecule_struct["CanonicalSMILES"] if ( Loading