Support serializing/deserializing results.json so that we don't have to recompute metrics to redo figures

phrdang · phrdang · commit b52d966ab9ac · 2026-05-12T16:05:15.000-07:00
diff --git a/src/notebooks/technical_report.ipynb b/src/notebooks/technical_report.ipynb
@@ -41,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "1f1bb86c",
    "metadata": {},
    "outputs": [],
@@ -171,17 +171,83 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "5e225998",
+   "execution_count": 8,
+   "id": "58168d30",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "def clean_enum_repr(s):\n",
+    "    # Regex breakdown:\n",
+    "    # <         : matches the opening bracket\n",
+    "    # ([^:]+)   : Capture Group 1: matches everything until the colon (the name)\n",
+    "    # :         : matches the colon\n",
+    "    # [^>]+     : matches the value and anything else until the closing bracket\n",
+    "    # >         : matches the closing bracket\n",
+    "    return re.sub(r'<([^:]+):[^>]+>', r'\\1', s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "5e225998",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded 11 metrics:\n",
+      "- backups_per_student\n",
+      "- total_time_spent_days\n",
+      "- problems_solved_by_last_backup\n",
+      "- backups_per_minute\n",
+      "- time_between_backups\n",
+      "- net_num_lines_added\n",
+      "- num_occurrences_print\n",
+      "- backups_with_print\n",
+      "- backups_per_problem\n",
+      "- worksessions_per_student\n",
+      "- worksession_length_per_student\n"
+     ]
+    }
+   ],
    "source": [
     "load_results = input(\"Would you like to load the existing results.json file? Y/N: \")\n",
     "if load_results.strip().upper() == \"Y\":\n",
     "    if os.path.exists(\"results.json\"):\n",
     "        with open(\"results.json\") as f:\n",
-    "            results = json.load(f)\n",
-    "            print(f\"Loaded metrics from results.json: {list(results.keys())}\")\n",
+    "            raw_results = json.load(f)\n",
+    "\n",
+    "        deserialized_results = {}\n",
+    "\n",
+    "        for metric, courses in raw_results.items():\n",
+    "            deserialized_results[metric] = {}\n",
+    "\n",
+    "            for course_str, data in courses.items():\n",
+    "                # Reduce the enum repr <Name: value> to just Name so that eval(...) works\n",
+    "                course_str = clean_enum_repr(course_str)\n",
+    "\n",
+    "                # Deserialize course string to Course object. NOTE: eval() executes arbitrary code, so only load a results.json you trust\n",
+    "                course_obj = eval(course_str)\n",
+    "                assert isinstance(course_obj, Course)\n",
+    "\n",
+    "                # Deserialize the data\n",
+    "                if isinstance(data, dict) and data.get(\"__df\"):\n",
+    "                    # Remove the flag before creating the DataFrame\n",
+    "                    data.pop(\"__df\")\n",
+    "                    processed_data = pd.DataFrame.from_dict(data)\n",
+    "                elif isinstance(data, list):\n",
+    "                    processed_data = np.array(data)\n",
+    "                else:\n",
+    "                    processed_data = data\n",
+    "\n",
+    "                deserialized_results[metric][course_obj] = processed_data\n",
+    "\n",
+    "        results = deserialized_results\n",
+    "\n",
+    "        print(f\"Loaded {len(results)} metrics:\")\n",
+    "        for metric in results.keys():\n",
+    "            print(f\"- {metric}\")\n",
     "    else:\n",
     "        print(\"results.json not found, defaulting to empty dict\")\n",
     "        results = {}\n",
@@ -191,7 +257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "id": "2422667d",
    "metadata": {},
    "outputs": [],
@@ -267,7 +333,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
    "id": "f9bf7d88",
    "metadata": {},
    "outputs": [
@@ -277,7 +343,7 @@
        "<Axes: ylabel='Count'>"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -3718,7 +3784,28 @@
    "outputs": [],
    "source": [
     "with open(\"results.json\", \"w\") as f:\n",
-    "    json.dump(results, f)"
+    "    serialized_results = {}\n",
+    "\n",
+    "    for metric, courses in results.items():\n",
+    "        # Initialize the metric dictionary so we don't get a KeyError\n",
+    "        serialized_results[metric] = {}\n",
+    "\n",
+    "        for course, data in courses.items():\n",
+    "            # Use repr(course) as the key: JSON object keys must be strings, and the repr is turned back into a Course when loading\n",
+    "            course_key = repr(course)\n",
+    "\n",
+    "            if isinstance(data, pd.DataFrame):\n",
+    "                # .to_json() returns a string; we parse it to a dict to modify it\n",
+    "                df_dict = json.loads(data.to_json())\n",
+    "                df_dict[\"__df\"] = True\n",
+    "                serialized_results[metric][course_key] = df_dict\n",
+    "            elif isinstance(data, np.ndarray):\n",
+    "                # NumPy arrays aren't JSON serializable by default\n",
+    "                serialized_results[metric][course_key] = data.tolist()\n",
+    "            else:\n",
+    "                serialized_results[metric][course_key] = data\n",
+    "\n",
+    "    json.dump(serialized_results, f, indent=2)"
    ]
   },
   {