| 189 | raise ValueError("Unexpected MD5 hash of the split file") |
| 190 | |
def process(self):
    """Build the molecular graphs and normalized regression labels.

    Reads the raw CSV at ``self.raw_data_path``, standardizes each of the
    eleven target columns to zero mean and unit standard deviation (using
    pandas' ``Series.std``), converts every SMILES string into a graph with
    ``self.smiles2graph`` and appends the resulting DGL graph to
    ``self.graphs``. The per-molecule target rows are appended to
    ``self.labels``, which is finally converted into a float32 tensor.

    ``self.smiles2graph`` is expected to return a mapping with the keys
    ``"edge_index"``, ``"edge_feat"``, ``"node_feat"`` and ``"num_nodes"``.

    Raises:
        AssertionError: If the number of edge features does not match the
            number of edges, or the number of node features does not match
            ``num_nodes`` for any converted molecule.
    """
    data_df = pd.read_csv(self.raw_data_path)
    smiles_list = data_df["smiles"]
    target_names = [
        "Inertia_mass_a",
        "Inertia_mass_b",
        "Inertia_mass_c",
        "Inertia_valence_a",
        "Inertia_valence_b",
        "Inertia_valence_c",
        "length_a",
        "length_b",
        "length_c",
        "Spherocity",
        "Plane_best_fit",
    ]
    # Normalize to zero mean and unit standard deviation.
    data_df.loc[:, target_names] = data_df.loc[:, target_names].apply(
        lambda x: (x - x.mean()) / x.std(), axis=0
    )
    # Extract the label matrix once as plain nested lists. Indexing
    # ``data_df.iloc[i][target_names]`` inside the loop would build an
    # object-dtype row Series for every molecule, which is very slow.
    label_rows = data_df[target_names].to_numpy().tolist()

    if self.verbose:
        print("Converting SMILES strings into graphs...")

    # ``zip`` has no length, so pass ``total`` to keep the progress bar sized.
    for smiles, y in tqdm(
        zip(smiles_list, label_rows), total=len(smiles_list)
    ):
        graph = self.smiles2graph(smiles)

        # Sanity-check the converter output before building the graph.
        assert len(graph["edge_feat"]) == graph["edge_index"].shape[1]
        assert len(graph["node_feat"]) == graph["num_nodes"]
        DGLgraph = dgl_graph(
            (graph["edge_index"][0], graph["edge_index"][1]),
            num_nodes=graph["num_nodes"],
        )
        # Edge and node features are stored as int64 tensors.
        DGLgraph.edata["feat"] = F.zerocopy_from_numpy(
            graph["edge_feat"]
        ).to(F.int64)
        DGLgraph.ndata["feat"] = F.zerocopy_from_numpy(
            graph["node_feat"]
        ).to(F.int64)

        self.graphs.append(DGLgraph)
        self.labels.append(y)

    # Replace the accumulated list of label rows with one float32 tensor.
    self.labels = F.tensor(self.labels, dtype=F.float32)
| 236 | |
| 237 | def load(self): |
| 238 | self.graphs, label_dict = load_graphs(self.graph_path) |