|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | +from ase.db import connect |
| 6 | +from scipy import stats |
| 7 | + |
| 8 | +from mlip_arena.models import REGISTRY, MLIPEnum |
| 9 | + |
| 10 | +DATA_DIR = Path(__file__).parent.absolute() |
| 11 | + |
| 12 | + |
| 13 | +def load_wbm_structures(): |
| 14 | + """ |
| 15 | + Load the WBM structures from a ASE DB file. |
| 16 | + """ |
| 17 | + with connect(DATA_DIR.parent / "wbm_structures.db") as db: |
| 18 | + for row in db.select(): |
| 19 | + yield row.toatoms(add_additional_information=True) |
| 20 | + |
| 21 | +def gather_results(): |
| 22 | + for model in MLIPEnum: |
| 23 | + if "eos_bulk" not in REGISTRY[model.name].get("gpu-tasks", []): |
| 24 | + continue |
| 25 | + |
| 26 | + if (DATA_DIR / f"{model.name}.parquet").exists(): |
| 27 | + continue |
| 28 | + |
| 29 | + all_data = [] |
| 30 | + |
| 31 | + for atoms in load_wbm_structures(): |
| 32 | + fpath = Path(model.name) / f"{atoms.info['key_value_pairs']['wbm_id']}.pkl" |
| 33 | + if not fpath.exists(): |
| 34 | + continue |
| 35 | + |
| 36 | + all_data.append(pd.read_pickle(fpath)) |
| 37 | + |
| 38 | + df = pd.concat(all_data, ignore_index=True) |
| 39 | + df.to_parquet(DATA_DIR / f"{model.name}.parquet") |
| 40 | + |
| 41 | + |
| 42 | +def summarize(): |
| 43 | + summary_table = pd.DataFrame( |
| 44 | + columns=[ |
| 45 | + "model", |
| 46 | + "energy-diff-flip-times", |
| 47 | + "tortuosity", |
| 48 | + "spearman-compression-energy", |
| 49 | + "spearman-compression-derivative", |
| 50 | + "spearman-tension-energy", |
| 51 | + "missing", |
| 52 | + ] |
| 53 | + ) |
| 54 | + |
| 55 | + |
| 56 | + for model in MLIPEnum: |
| 57 | + fpath = DATA_DIR / f"{model.name}.parquet" |
| 58 | + if not fpath.exists(): |
| 59 | + continue |
| 60 | + df_raw_results = pd.read_parquet(fpath) |
| 61 | + |
| 62 | + df_analyzed = pd.DataFrame( |
| 63 | + columns=[ |
| 64 | + "model", |
| 65 | + "structure", |
| 66 | + "formula", |
| 67 | + "volume-ratio", |
| 68 | + "energy-delta-per-atom", |
| 69 | + "energy-diff-flip-times", |
| 70 | + "energy-delta-per-volume-b0", |
| 71 | + "tortuosity", |
| 72 | + "spearman-compression-energy", |
| 73 | + "spearman-compression-derivative", |
| 74 | + "spearman-tension-energy", |
| 75 | + "missing", |
| 76 | + ] |
| 77 | + ) |
| 78 | + |
| 79 | + for wbm_struct in load_wbm_structures(): |
| 80 | + structure_id = wbm_struct.info["key_value_pairs"]["wbm_id"] |
| 81 | + |
| 82 | + try: |
| 83 | + results = df_raw_results.loc[df_raw_results["id"] == structure_id] |
| 84 | + b0 = results["b0"].values[0] |
| 85 | + # vol0 = results["v0"].values[0] |
| 86 | + results = results["eos"].values[0] |
| 87 | + es = np.array(results["energies"]) |
| 88 | + vols = np.array(results["volumes"]) |
| 89 | + |
| 90 | + indices = np.argsort(vols) |
| 91 | + vols = vols[indices] |
| 92 | + es = es[indices] |
| 93 | + |
| 94 | + imine = len(es) // 2 |
| 95 | + # min_center_val = np.min(es[imid - 1 : imid + 2]) |
| 96 | + # imine = np.where(es == min_center_val)[0][0] |
| 97 | + emin = es[imine] |
| 98 | + vol0 = vols[imine] |
| 99 | + |
| 100 | + interpolated_volumes = [ |
| 101 | + (vols[i] + vols[i + 1]) / 2 for i in range(len(vols) - 1) |
| 102 | + ] |
| 103 | + ediff = np.diff(es) |
| 104 | + ediff_sign = np.sign(ediff) |
| 105 | + mask = ediff_sign != 0 |
| 106 | + ediff = ediff[mask] |
| 107 | + ediff_sign = ediff_sign[mask] |
| 108 | + ediff_flip = np.diff(ediff_sign) != 0 |
| 109 | + |
| 110 | + etv = np.sum(np.abs(np.diff(es))) |
| 111 | + |
| 112 | + data = { |
| 113 | + "model": model.name, |
| 114 | + "structure": structure_id, |
| 115 | + "formula": wbm_struct.get_chemical_formula(), |
| 116 | + "missing": False, |
| 117 | + "volume-ratio": vols / vol0, |
| 118 | + "energy-delta-per-atom": (es - emin) / len(wbm_struct), |
| 119 | + "energy-diff-flip-times": np.sum(ediff_flip).astype(int), |
| 120 | + "energy-delta-per-volume-b0": (es - emin) / (b0*vol0), |
| 121 | + "tortuosity": etv / (abs(es[0] - emin) + abs(es[-1] - emin)), |
| 122 | + "spearman-compression-energy": stats.spearmanr( |
| 123 | + vols[:imine], es[:imine] |
| 124 | + ).statistic, |
| 125 | + "spearman-compression-derivative": stats.spearmanr( |
| 126 | + interpolated_volumes[:imine], ediff[:imine] |
| 127 | + ).statistic, |
| 128 | + "spearman-tension-energy": stats.spearmanr( |
| 129 | + vols[imine:], es[imine:] |
| 130 | + ).statistic, |
| 131 | + } |
| 132 | + |
| 133 | + except Exception as e: |
| 134 | + print(e) |
| 135 | + data = { |
| 136 | + "model": model.name, |
| 137 | + "structure": structure_id, |
| 138 | + "formula": wbm_struct.get_chemical_formula(), |
| 139 | + "missing": True, |
| 140 | + "volume-ratio": None, |
| 141 | + "energy-delta-per-atom": None, |
| 142 | + "energy-delta-per-volume-b0": None, |
| 143 | + "energy-diff-flip-times": None, |
| 144 | + "tortuosity": None, |
| 145 | + "spearman-compression-energy": None, |
| 146 | + "spearman-compression-derivative": None, |
| 147 | + "spearman-tension-energy": None, |
| 148 | + } |
| 149 | + |
| 150 | + df_analyzed = pd.concat([df_analyzed, pd.DataFrame([data])], ignore_index=True) |
| 151 | + |
| 152 | + df_analyzed.to_parquet(DATA_DIR / f"{model.name}_processed.parquet") |
| 153 | + # json_fpath = DATA_DIR / f"EV_scan_analyzed_{model.name}.json" |
| 154 | + |
| 155 | + # df_analyzed.to_json(json_fpath, orient="records") |
| 156 | + |
| 157 | + valid_results = df_analyzed[df_analyzed["missing"] == False] |
| 158 | + |
| 159 | + analysis_summary = { |
| 160 | + "model": model.name, |
| 161 | + "energy-diff-flip-times": valid_results["energy-diff-flip-times"].mean(), |
| 162 | + "tortuosity": valid_results["tortuosity"].mean(), |
| 163 | + "spearman-compression-energy": valid_results[ |
| 164 | + "spearman-compression-energy" |
| 165 | + ].mean(), |
| 166 | + "spearman-compression-derivative": valid_results[ |
| 167 | + "spearman-compression-derivative" |
| 168 | + ].mean(), |
| 169 | + "spearman-tension-energy": valid_results["spearman-tension-energy"].mean(), |
| 170 | + "missing": len(df_analyzed[df_analyzed["missing"] == True]), |
| 171 | + } |
| 172 | + summary_table = pd.concat( |
| 173 | + [summary_table, pd.DataFrame([analysis_summary])], ignore_index=True |
| 174 | + ) |
| 175 | + |
| 176 | + |
| 177 | + flip_rank = ( |
| 178 | + (summary_table["energy-diff-flip-times"] - 1) |
| 179 | + .abs() |
| 180 | + .rank(ascending=True, method="min") |
| 181 | + ) |
| 182 | + tortuosity_rank = summary_table["tortuosity"].rank(ascending=True, method="min") |
| 183 | + spearman_compression_energy_rank = summary_table["spearman-compression-energy"].rank( |
| 184 | + method="min" |
| 185 | + ) |
| 186 | + spearman_compression_derivative_rank = summary_table[ |
| 187 | + "spearman-compression-derivative" |
| 188 | + ].rank(ascending=False, method="min") |
| 189 | + spearman_tension_energy_rank = summary_table["spearman-tension-energy"].rank( |
| 190 | + ascending=False, method="min" |
| 191 | + ) |
| 192 | + missing_rank = summary_table["missing"].rank(ascending=True, method="min") |
| 193 | + |
| 194 | + rank_aggr = ( |
| 195 | + flip_rank |
| 196 | + + tortuosity_rank |
| 197 | + + spearman_compression_energy_rank |
| 198 | + + spearman_compression_derivative_rank |
| 199 | + + spearman_tension_energy_rank |
| 200 | + + missing_rank |
| 201 | + ) |
| 202 | + rank = rank_aggr.rank(method="min") |
| 203 | + |
| 204 | + summary_table.insert(1, "rank", rank.astype(int)) |
| 205 | + summary_table.insert(2, "rank-aggregation", rank_aggr.astype(int)) |
| 206 | + summary_table = summary_table.sort_values(by="rank", ascending=True) |
| 207 | + summary_table = summary_table.reset_index(drop=True) |
| 208 | + |
| 209 | + summary_table.to_csv(DATA_DIR / "summary.csv", index=False) |
| 210 | + summary_table.to_latex(DATA_DIR / "summary.tex", index=False) |
| 211 | + |
| 212 | + return summary_table |
| 213 | + |
| 214 | +if __name__ == "__main__": |
| 215 | + gather_results() |
| 216 | + summarize() |
0 commit comments