Loading testcases/simple_million.py +102 −21 Original line number Diff line number Diff line import molecule_input as mi import random import requests import time ASPIRIN = "C9H8O4" DOXYCYCLINE = "C22H24N2O8" Loading @@ -9,27 +11,43 @@ PARACETAMOL = "C8H9NO2" VANCOMYCIN = "C66H75Cl2N9O24" FOSFOMYCIN = "C3H7O4P" LINEZOLID = "C16H20FN3O4" MIN_ISOMER_COUNT = 500 MAX_CID_SEGMENT_LEN = 4000 # maybe 3997 MAX_MOLECULE_COUNT = 10_000_000 C_count_max = 56 C_count_min = 3 C_count_min = 16 H_C_ratio_max = 7 / 3 H_C_ratio_max = 48 / 25 H_C_ratio_min = 17 / 15 Cl_C_ratio_max = 1 / 30 Cl_C_ratio_min = 0 Cl_range = [0, 1] F_C_ratio_max = 1 / 16 F_C_ratio_min = 0 F_range = [0, 1] N_C_ratio_max = 4 / 13 N_C_ratio_min = 0 O_C_ratio_max = 4 / 3 O_C_ratio_max = 2 / 5 O_C_ratio_min = 1 / 4 P_C_ratio_max = 1 / 3 P_C_ratio_min = 0 P_range = [0, 1] S_C_ratio_max = 1 / 15 S_C_ratio_min = 0 S_range = [0, 1, 2] def add_H_N_O(atom: str, C_count: int, ratio_max: float, ratio_min: float): def add_atom(atom: str, C_count: int, ratio_max: float, ratio_min: float): ret = "" ratio = ratio_min + random.random() * (ratio_max - ratio_min) count = int(C_count * ratio) Loading @@ -40,7 +58,7 @@ def add_H_N_O(atom: str, C_count: int, ratio_max: float, ratio_min: float): return ret def add_other(atom: str, range: list): def add_atom2(atom: str, range: list): ret = "" count = random.choice(range) if count > 1: Loading @@ -50,21 +68,84 @@ def add_other(atom: str, range: list): return ret def wait_for_api(previous_time): if previous_time != None: while time.time() - previous_time < 0.2: pass return time.time() if __name__ == "__main__": molecule_count = 0 while molecule_count < 10_000_000: molecule_formula = "" C_count = random.randint(C_count_min, C_count_max) molecule_formula += "C" + str(C_count) previous_time = None molecule_formula += add_H_N_O("H", C_count, H_C_ratio_max, H_C_ratio_min) molecule_formula += add_other("Cl", Cl_range) molecule_formula += add_other("F", F_range) molecule_formula += add_H_N_O("N", C_count, N_C_ratio_max, N_C_ratio_min) molecule_formula += add_H_N_O("O", C_count, O_C_ratio_max, O_C_ratio_min) molecule_formula += add_other("P", P_range) molecule_formula += add_other("S", S_range) previous_formulae = [""] molecule_count = 0 while molecule_count < MAX_MOLECULE_COUNT: print(molecule_formula) chemical_formula = "" while chemical_formula in previous_formulae: chemical_formula = "" C_count = random.randint(C_count_min, C_count_max) chemical_formula += "C" + str(C_count) chemical_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) # chemical_formula += add_atom("Cl", C_count, Cl_C_ratio_max, Cl_C_ratio_min) # chemical_formula += add_atom("F", C_count, F_C_ratio_max, F_C_ratio_min) chemical_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) chemical_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) # chemical_formula += add_atom("P", C_count, P_C_ratio_max, P_C_ratio_min) # chemical_formula += add_atom("S", C_count, S_C_ratio_max, S_C_ratio_min) url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/" + chemical_formula + "/cids/JSON" ) previous_time = wait_for_api(previous_time) response = requests.get(url) if response.status_code != 200: continue data = response.json() if "Fault" in data: print(chemical_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(chemical_formula) continue cids = data["IdentifierList"]["CID"] isomer_count = len(cids) if isomer_count < MIN_ISOMER_COUNT: print(chemical_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(chemical_formula) continue cid_segment = None cid_idx = 0 while True: cid_segment = str(cids[cid_idx]) cid_idx += 1 if cid_idx == isomer_count: break cid = cids[cid_idx] while len(cid_segment + "," + str(cid)) < MAX_CID_SEGMENT_LEN: cid_segment += "," + str(cid) cid_idx += 1 cid = cids[cid_idx] url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + cid_segment + "/property/Title,CanonicalSMILES/json" ) previous_time = wait_for_api(previous_time) response = requests.get(url) data = response.json() for molecule_struct in data["PropertyTable"]["Properties"]: if "CanonicalSMILES" in molecule_struct: molecule_name = chemical_formula + "_" + str(molecule_struct["CID"]) smiles = molecule_struct["CanonicalSMILES"] if mi.writeMolecule(molecule_name, smiles, "simple_molecules") == 0: molecule_count += 1 mi.writeIsomorphic(molecule_name, smiles, "simple_isomorphic") print("goodbye") Loading
testcases/simple_million.py +102 −21 Original line number Diff line number Diff line import molecule_input as mi import random import requests import time ASPIRIN = "C9H8O4" DOXYCYCLINE = "C22H24N2O8" Loading @@ -9,27 +11,43 @@ PARACETAMOL = "C8H9NO2" VANCOMYCIN = "C66H75Cl2N9O24" FOSFOMYCIN = "C3H7O4P" LINEZOLID = "C16H20FN3O4" MIN_ISOMER_COUNT = 500 MAX_CID_SEGMENT_LEN = 4000 # maybe 3997 MAX_MOLECULE_COUNT = 10_000_000 C_count_max = 56 C_count_min = 3 C_count_min = 16 H_C_ratio_max = 7 / 3 H_C_ratio_max = 48 / 25 H_C_ratio_min = 17 / 15 Cl_C_ratio_max = 1 / 30 Cl_C_ratio_min = 0 Cl_range = [0, 1] F_C_ratio_max = 1 / 16 F_C_ratio_min = 0 F_range = [0, 1] N_C_ratio_max = 4 / 13 N_C_ratio_min = 0 O_C_ratio_max = 4 / 3 O_C_ratio_max = 2 / 5 O_C_ratio_min = 1 / 4 P_C_ratio_max = 1 / 3 P_C_ratio_min = 0 P_range = [0, 1] S_C_ratio_max = 1 / 15 S_C_ratio_min = 0 S_range = [0, 1, 2] def add_H_N_O(atom: str, C_count: int, ratio_max: float, ratio_min: float): def add_atom(atom: str, C_count: int, ratio_max: float, ratio_min: float): ret = "" ratio = ratio_min + random.random() * (ratio_max - ratio_min) count = int(C_count * ratio) Loading @@ -40,7 +58,7 @@ def add_H_N_O(atom: str, C_count: int, ratio_max: float, ratio_min: float): return ret def add_other(atom: str, range: list): def add_atom2(atom: str, range: list): ret = "" count = random.choice(range) if count > 1: Loading @@ -50,21 +68,84 @@ def add_other(atom: str, range: list): return ret def wait_for_api(previous_time): if previous_time != None: while time.time() - previous_time < 0.2: pass return time.time() if __name__ == "__main__": molecule_count = 0 while molecule_count < 10_000_000: molecule_formula = "" C_count = random.randint(C_count_min, C_count_max) molecule_formula += "C" + str(C_count) previous_time = None molecule_formula += add_H_N_O("H", C_count, H_C_ratio_max, H_C_ratio_min) molecule_formula += add_other("Cl", Cl_range) molecule_formula += add_other("F", F_range) molecule_formula += add_H_N_O("N", C_count, N_C_ratio_max, N_C_ratio_min) molecule_formula += add_H_N_O("O", C_count, O_C_ratio_max, O_C_ratio_min) molecule_formula += add_other("P", P_range) molecule_formula += add_other("S", S_range) previous_formulae = [""] molecule_count = 0 while molecule_count < MAX_MOLECULE_COUNT: print(molecule_formula) chemical_formula = "" while chemical_formula in previous_formulae: chemical_formula = "" C_count = random.randint(C_count_min, C_count_max) chemical_formula += "C" + str(C_count) chemical_formula += add_atom("H", C_count, H_C_ratio_max, H_C_ratio_min) # chemical_formula += add_atom("Cl", C_count, Cl_C_ratio_max, Cl_C_ratio_min) # chemical_formula += add_atom("F", C_count, F_C_ratio_max, F_C_ratio_min) chemical_formula += add_atom("N", C_count, N_C_ratio_max, N_C_ratio_min) chemical_formula += add_atom("O", C_count, O_C_ratio_max, O_C_ratio_min) # chemical_formula += add_atom("P", C_count, P_C_ratio_max, P_C_ratio_min) # chemical_formula += add_atom("S", C_count, S_C_ratio_max, S_C_ratio_min) url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastformula/" + chemical_formula + "/cids/JSON" ) previous_time = wait_for_api(previous_time) response = requests.get(url) if response.status_code != 200: continue data = response.json() if "Fault" in data: print(chemical_formula + ": " + data["Fault"]["Message"]) previous_formulae.append(chemical_formula) continue cids = data["IdentifierList"]["CID"] isomer_count = len(cids) if isomer_count < MIN_ISOMER_COUNT: print(chemical_formula + ": Has only " + str(isomer_count) + " isomers") previous_formulae.append(chemical_formula) continue cid_segment = None cid_idx = 0 while True: cid_segment = str(cids[cid_idx]) cid_idx += 1 if cid_idx == isomer_count: break cid = cids[cid_idx] while len(cid_segment + "," + str(cid)) < MAX_CID_SEGMENT_LEN: cid_segment += "," + str(cid) cid_idx += 1 cid = cids[cid_idx] url = ( "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + cid_segment + "/property/Title,CanonicalSMILES/json" ) previous_time = wait_for_api(previous_time) response = requests.get(url) data = response.json() for molecule_struct in data["PropertyTable"]["Properties"]: if "CanonicalSMILES" in molecule_struct: molecule_name = chemical_formula + "_" + str(molecule_struct["CID"]) smiles = molecule_struct["CanonicalSMILES"] if mi.writeMolecule(molecule_name, smiles, "simple_molecules") == 0: molecule_count += 1 mi.writeIsomorphic(molecule_name, smiles, "simple_isomorphic") print("goodbye")