Upload New File (e0650809) · Commits · EC504 Spring 2024 Group Projects / Group4

testcases/downloadPubChem.py

0 → 100644

+114 −0

Original line number	Diff line number	Diff line
		from pysmiles import read_smiles
		import networkx as nx
		import random
		import requests
		import os
		import time
		import sys

		def getOrder(line):
		    """Return the integer bond order recorded on one edge-list line.

		    The line is expected to carry its edge attributes as the text
		    ``{'order': N}``, for example ``"0 1 {'order': 2}"``; a trailing
		    newline is tolerated.

		    Args:
		        line: one line of edge-list text.

		    Returns:
		        The bond order as an int, or -1 when the line has no
		        ``{'order': `` marker or the value is not a plain non-negative
		        integer (for example ``1.5``).
		    """
		    marker = "{'order': "
		    _, found, tail = line.partition(marker)
		    if not found:
		        # No order attribute on this line: report it as invalid
		        # instead of raising IndexError.
		        return -1

		    # Drop the closing brace of the attribute dict and any newline.
		    order = tail.rstrip('}\n')
		    # isdecimal() only accepts characters int() can parse, unlike
		    # isdigit(), which also accepts e.g. superscript digits.
		    return int(order) if order.isdecimal() else -1

		def processContent(content):
		    """Expand each edge line into one "u v" line per unit of bond order.

		    Args:
		        content: iterable of edge-list lines such as
		            ``"0 1 {'order': 2}"``. Lines may or may not end with a
		            newline.

		    Returns:
		        The rewritten edge list as one string in which every edge
		        appears ``order`` times as ``"u v\\n"``, or -1 as soon as a
		        line carries a bond order that getOrder rejects.
		    """
		    pieces = []
		    for line in content:
		        if "{'order':" not in line:
		            # Pass other lines through unchanged, but make sure a
		            # non-empty line is newline-terminated so it cannot fuse
		            # with the next one when the caller stripped newlines.
		            if line and not line.endswith('\n'):
		                line += '\n'
		            pieces.append(line)
		            continue

		        bond_order = getOrder(line)
		        # An invalid bond order invalidates the whole edge list.
		        if bond_order == -1:
		            return -1

		        # The first two whitespace-separated fields are the endpoints.
		        vertex_pair = ' '.join(line.split()[:2])
		        pieces.extend([vertex_pair + '\n'] * bond_order)

		    return ''.join(pieces)

		def writeMolecule(mol_name, smiles):
		    """Write one molecule, parsed from SMILES, to a text file.

		    The file ``./testcases/molecules/<mol_name>.txt`` contains, in order:
		    the molecule name, the number of atoms, one element symbol per atom,
		    and then the edge list produced by processContent (each bond
		    repeated once per unit of bond order).

		    Args:
		        mol_name: name used both as the first line of the file and as
		            the file's base name.
		        smiles: SMILES string handed to pysmiles' read_smiles with
		            explicit hydrogens.

		    Returns:
		        0 after the file was written (its path is printed to stdout),
		        or -1 when the molecule cannot be represented; in that case no
		        file for this molecule is left on disk.
		    """
		    mol_with_H = read_smiles(smiles, explicit_hydrogen=True)
		    file_path_mol = "./testcases/molecules/" + mol_name + ".txt"

		    # Build the edge list in memory: these are the same lines
		    # nx.write_edgelist would put in a file, minus the newlines, so
		    # nothing has to be written, re-read and possibly deleted again.
		    modified_edgelist = processContent(list(nx.generate_edgelist(mol_with_H)))

		    # NOTE(review): a node without an 'element' attribute shows up as
		    # None here; such a molecule is treated as invalid instead of
		    # crashing on None + str - confirm this is the desired policy.
		    elements = [element for _, element in mol_with_H.nodes(data='element')]

		    if modified_edgelist == -1 or any(element is None for element in elements):
		        # Keep the original post-condition: an invalid molecule has no
		        # file, even if a stale one exists from an earlier run.
		        try:
		            os.remove(file_path_mol)
		        except FileNotFoundError:
		            pass
		        return -1

		    # Metadata header: name, atom count, then one element per line.
		    mol_data = '\n'.join([mol_name, str(len(elements)), *elements]) + '\n'

		    # A missing output directory must not abort the download.
		    os.makedirs(os.path.dirname(file_path_mol), exist_ok=True)
		    with open(file_path_mol, 'w') as file:
		        file.write(mol_data + modified_edgelist)

		    # print filepath to pick up in Java
		    print(file_path_mol)
		    return 0

		def writePubChem(start, end):
		    """Download compounds with CIDs in [start, end) and write them to disk.

		    The ids are requested from the PubChem PUG REST endpoint in batches
		    of at most 100. Every returned compound that has a
		    ``CanonicalSMILES`` property (other than molecular hydrogen,
		    ``[HH]``) is passed to writeMolecule as ``molecule<CID>``.

		    Args:
		        start: first compound id to request (inclusive).
		        end: compound id at which to stop (exclusive).
		    """
		    MAXSTEP = 100          # largest number of ids per request
		    MIN_INTERVAL = 0.3     # minimum seconds between batch starts
		    REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned

		    # An empty or reversed range would give range() a zero or negative
		    # step below; there is nothing to download in that case.
		    if end <= start:
		        return

		    INCREMENT = min(end - start, MAXSTEP)
		    for indx in range(start, end, INCREMENT):
		        start_time = time.time()

		        # Clamp the last batch so ids at or beyond `end` are never requested.
		        batch_end = min(indx + INCREMENT, end)
		        indexes = ",".join(str(i) for i in range(indx, batch_end))

		        # query from pubchem URL
		        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + indexes + '/property/Title,CanonicalSMILES/json'
		        try:
		            response = requests.get(url, timeout=REQUEST_TIMEOUT)
		        except requests.RequestException as err:
		            # A network failure skips this batch instead of aborting the
		            # run. Reported on stderr so stdout keeps carrying only what
		            # it carried before.
		            print("Failed to retrieve the page. Error:", err, file=sys.stderr)
		        else:
		            if response.status_code == 200:
		                page_text = response.json()
		                for chemical in page_text['PropertyTable']['Properties']:
		                    # Skip entries that come back without a SMILES string.
		                    if 'CanonicalSMILES' not in chemical:
		                        continue
		                    smiles = chemical['CanonicalSMILES']
		                    # ignore Hydrogen Molecules
		                    if smiles == "[HH]":
		                        continue
		                    writeMolecule("molecule" + str(chemical['CID']), smiles)
		            else:
		                # NOTE(review): this goes to stdout, which writeMolecule
		                # also uses for file paths - confirm the reader copes.
		                print("Failed to retrieve the page. Status code:", response.status_code)

		        # Space batches at least MIN_INTERVAL apart (presumably to respect
		        # the service's request-rate limit) by sleeping, not spinning.
		        remaining = MIN_INTERVAL - (time.time() - start_time)
		        if remaining > 0:
		            time.sleep(remaining)

		if __name__ == "__main__":
		    # The start and end indexes arrive as the first two command-line
		    # arguments (supplied by the Java caller).
		    start, end = int(sys.argv[1]), int(sys.argv[2])

		    # Only a non-negative, strictly increasing range is processed; any
		    # other input is ignored silently. end + 1 is passed on, presumably
		    # because writePubChem treats its end as exclusive.
		    if 0 <= start < end:
		        writePubChem(start, end + 1)