()
| 1206 | |
| 1207 | |
| 1208 | def test_clustering_quality(): |
| 1209 | dataset_path = "public/pathway/python/pathway/tests/data/louvain_graph.gexf" |
| 1210 | # TODO (in another PR): |
| 1211 | # - add another type of xfail, that is allowed to fail when tested on environments |
| 1212 | # that do not do dvc pull (requires xfail, and perhaps setting some env_variable |
| 1213 | # for environments that do dvc_pull) |
| 1214 | # - separate this test to some different batch of tests that are executed |
| 1215 | # periodically (as it is some kind of clustering quality benchmark), one won't make |
| 1216 | # it slow, but adding such tests en masse will make default jenkins run slow |
| 1217 | |
| 1218 | if not os.path.isfile(dataset_path): |
| 1219 | pytest.skip("dvc file with the input graph is unavailable") |
| 1220 | graph = nx.read_gexf(dataset_path) |
| 1221 | |
| 1222 | input_vertices = dict() |
| 1223 | input_id = dict() |
| 1224 | for str_id, visits in graph.nodes(data="visits"): |
| 1225 | input_id[int(str_id)] = int(str_id) |
| 1226 | input_vertices[int(str_id)] = visits |
| 1227 | |
| 1228 | cnt = 1 |
| 1229 | input_edges_u = dict() |
| 1230 | input_edges_v = dict() |
| 1231 | input_edges_visits = dict() |
| 1232 | for stru, strv, visits in graph.edges(data="visits"): |
| 1233 | input_edges_u[cnt] = int(stru) |
| 1234 | input_edges_v[cnt] = int(strv) |
| 1235 | input_edges_visits[cnt] = visits |
| 1236 | cnt += 1 |
| 1237 | |
| 1238 | vertices = ( |
| 1239 | from_dicts(idd=input_id, visits=input_vertices) |
| 1240 | .with_id_from(pw.this.idd) |
| 1241 | .without(pw.this.idd) |
| 1242 | ) |
| 1243 | |
| 1244 | edges = from_dicts( |
| 1245 | u=input_edges_u, v=input_edges_v, edge_visits=input_edges_visits |
| 1246 | ).with_columns( |
| 1247 | u=vertices.pointer_from(pw.this.u), v=vertices.pointer_from(pw.this.v) |
| 1248 | ) |
| 1249 | |
| 1250 | weighted_edges = compute_weights(edges, vertices) |
| 1251 | doubled_edges = pw.Table.concat_reindex( |
| 1252 | weighted_edges, |
| 1253 | weighted_edges.select( |
| 1254 | u=weighted_edges.v, v=weighted_edges.u, weight=weighted_edges.weight |
| 1255 | ), |
| 1256 | ) |
| 1257 | MAX_LVL = 10 |
| 1258 | ret = louvain_communities_fixed_iterations( |
| 1259 | WeightedGraph.from_vertices_and_weighted_edges(vertices, doubled_edges), |
| 1260 | 0.1, |
| 1261 | MAX_LVL, |
| 1262 | ) |
| 1263 | our_mod = exact_modularity( |
| 1264 | WeightedGraph.from_vertices_and_weighted_edges(vertices, doubled_edges), |
| 1265 | ret.clustering_levels.filter(ret.clustering_levels.level == MAX_LVL).with_id( |
nothing calls this directly
no test coverage detected