Skip to content
This repository was archived by the owner on Oct 4, 2021. It is now read-only.

Commit 26a9abe

Browse files
committed
feat: add mean over time + NA for latest period
1 parent 3e9ada8 commit 26a9abe

File tree

1 file changed

+118
-24
lines changed

1 file changed

+118
-24
lines changed

notebooks/02-rapport_post_integration.ipynb

Lines changed: 118 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "markdown",
5-
"id": "mental-deployment",
5+
"id": "directed-vessel",
66
"metadata": {},
77
"source": [
88
"# Automated Post-integration Report - Signaux Faibles\n",
@@ -12,7 +12,7 @@
1212
{
1313
"cell_type": "code",
1414
"execution_count": null,
15-
"id": "stock-shame",
15+
"id": "electoral-joining",
1616
"metadata": {},
1717
"outputs": [],
1818
"source": [
@@ -51,12 +51,15 @@
5151
" \"paydex_nb_jours_past_12\",\n",
5252
"]\n",
5353
"# ces variables sont toujours requêtées\n",
54-
"VARIABLES += [\"outcome\", \"periode\", \"siret\", \"siren\", \"time_til_outcome\", \"code_naf\"]"
54+
"VARIABLES += [\"outcome\", \"periode\", \"siret\", \"siren\", \"time_til_outcome\", \"code_naf\"]\n",
55+
"\n",
56+
"# période actuelle\n",
57+
"LATEST_PERIODE = \"2021-02-01\""
5558
]
5659
},
5760
{
5861
"cell_type": "markdown",
59-
"id": "adjustable-arkansas",
62+
"id": "athletic-adams",
6063
"metadata": {},
6164
"source": [
6265
"## Fetch a random sample of the data"
@@ -65,7 +68,7 @@
6568
{
6669
"cell_type": "code",
6770
"execution_count": null,
68-
"id": "false-shield",
71+
"id": "tutorial-congress",
6972
"metadata": {},
7073
"outputs": [],
7174
"source": [
@@ -76,7 +79,7 @@
7679
{
7780
"cell_type": "code",
7881
"execution_count": null,
79-
"id": "formed-salvation",
82+
"id": "mighty-feelings",
8083
"metadata": {},
8184
"outputs": [],
8285
"source": [
@@ -86,20 +89,20 @@
8689
{
8790
"cell_type": "code",
8891
"execution_count": null,
89-
"id": "suspended-london",
92+
"id": "extra-panama",
9093
"metadata": {},
9194
"outputs": [],
9295
"source": [
9396
"dataset = SFDataset(\n",
9497
" fields = VARIABLES,\n",
95-
" sample_size=10_000\n",
98+
" sample_size=100_000\n",
9699
")\n",
97100
"dataset.fetch_data();"
98101
]
99102
},
100103
{
101104
"cell_type": "markdown",
102-
"id": "pediatric-drama",
105+
"id": "headed-aurora",
103106
"metadata": {},
104107
"source": [
105108
"## Temporal Coverage and NA values"
@@ -108,7 +111,7 @@
108111
{
109112
"cell_type": "code",
110113
"execution_count": null,
111-
"id": "theoretical-density",
114+
"id": "comic-shift",
112115
"metadata": {},
113116
"outputs": [],
114117
"source": [
@@ -118,7 +121,7 @@
118121
{
119122
"cell_type": "code",
120123
"execution_count": null,
121-
"id": "promotional-heritage",
124+
"id": "optional-corner",
122125
"metadata": {},
123126
"outputs": [],
124127
"source": [
@@ -129,7 +132,7 @@
129132
{
130133
"cell_type": "code",
131134
"execution_count": null,
132-
"id": "included-industry",
135+
"id": "proof-horse",
133136
"metadata": {},
134137
"outputs": [],
135138
"source": [
@@ -140,7 +143,7 @@
140143
},
141144
{
142145
"cell_type": "markdown",
143-
"id": "logical-bailey",
146+
"id": "numerous-senate",
144147
"metadata": {},
145148
"source": [
146149
"## Coverage over time for selected variables"
@@ -149,7 +152,7 @@
149152
{
150153
"cell_type": "code",
151154
"execution_count": null,
152-
"id": "fiscal-samoa",
155+
"id": "pretty-memorabilia",
153156
"metadata": {},
154157
"outputs": [],
155158
"source": [
@@ -160,14 +163,10 @@
160163
{
161164
"cell_type": "code",
162165
"execution_count": null,
163-
"id": "handled-tuning",
166+
"id": "constitutional-audience",
164167
"metadata": {},
165168
"outputs": [],
166169
"source": [
167-
"def count_na_prop(series):\n",
168-
" return (1 - series.isna().sum() / len(series)) * 100\n",
169-
"\n",
170-
"\n",
171170
"fig, axs = plt.subplots(len(VARIABLES), figsize=(10, 100))\n",
172171
"fig.tight_layout()\n",
173172
"for i, variable in enumerate(VARIABLES):\n",
@@ -178,26 +177,121 @@
178177
" axs[i].set(adjustable='box')"
179178
]
180179
},
180+
{
181+
"cell_type": "markdown",
182+
"id": "aboriginal-dominican",
183+
"metadata": {},
184+
"source": [
185+
"## Average over time"
186+
]
187+
},
181188
{
182189
"cell_type": "code",
183190
"execution_count": null,
184-
"id": "critical-category",
191+
"id": "local-beijing",
185192
"metadata": {},
186193
"outputs": [],
187-
"source": []
194+
"source": [
195+
"from pandas.api.types import is_numeric_dtype"
196+
]
188197
},
189198
{
190199
"cell_type": "code",
191200
"execution_count": null,
192-
"id": "super-arabic",
201+
"id": "purple-helicopter",
193202
"metadata": {},
194203
"outputs": [],
195-
"source": []
204+
"source": [
205+
"VARIABLES_TO_AVERAGE = [var for var in VARIABLES if is_numeric_dtype(dataset.data[var])]\n",
206+
"fig, axs = plt.subplots(len(VARIABLES_TO_AVERAGE), figsize=(10, 100))\n",
207+
"fig.tight_layout()\n",
208+
"for i, variable in enumerate(VARIABLES_TO_AVERAGE):\n",
209+
" grouped = dataset.data.groupby(pd.Grouper(key=\"periode\", freq=\"M\")).agg({f\"{variable}\": \"mean\"})\n",
210+
" axs[i].set_title(f\"{variable}\")\n",
211+
" #axs[i].set_ylim([0, 100])\n",
212+
" axs[i].plot(grouped.index, grouped[variable], \"-\");  # Axes.plot handles the datetime index directly; plot_date is deprecated in Matplotlib\n",
213+
" axs[i].set(adjustable='box')"
214+
]
215+
},
216+
{
217+
"cell_type": "markdown",
218+
"id": "guided-launch",
219+
"metadata": {},
220+
"source": [
221+
"## Codes NAF"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": null,
227+
"id": "crude-wesley",
228+
"metadata": {},
229+
"outputs": [],
230+
"source": [
231+
"import seaborn as sns\n",
232+
"grouped = dataset.data.groupby(\"code_naf\", as_index=False).agg({\"outcome\": \"count\"})\n",
233+
"sns.barplot(x = grouped.code_naf, y = grouped.outcome);"
234+
]
235+
},
236+
{
237+
"cell_type": "markdown",
238+
"id": "distinguished-router",
239+
"metadata": {},
240+
"source": [
241+
"## Codes NAF over time"
242+
]
243+
},
244+
{
245+
"cell_type": "code",
246+
"execution_count": null,
247+
"id": "satisfactory-selling",
248+
"metadata": {},
249+
"outputs": [],
250+
"source": [
251+
"grouped = dataset.data.groupby([pd.Grouper(key = \"periode\", freq = \"2Q\"), \"code_naf\"]).agg({\"outcome\": \"count\"}).reset_index()\n",
252+
"plt.figure(figsize = (15, 10))\n",
253+
"sns.lineplot(x = grouped.periode, y = grouped.outcome, hue = grouped.code_naf);"
254+
]
255+
},
256+
{
257+
"cell_type": "markdown",
258+
"id": "wanted-retrieval",
259+
"metadata": {},
260+
"source": [
261+
"## Analysis for latest period only"
262+
]
263+
},
264+
{
265+
"cell_type": "code",
266+
"execution_count": null,
267+
"id": "structural-bridal",
268+
"metadata": {},
269+
"outputs": [],
270+
"source": [
271+
"dataset = SFDataset(\n",
272+
" fields = VARIABLES,\n",
273+
" date_min = LATEST_PERIODE,\n",
274+
" sample_size=100_000\n",
275+
")\n",
276+
"dataset.fetch_data();"
277+
]
278+
},
279+
{
280+
"cell_type": "code",
281+
"execution_count": null,
282+
"id": "continuing-minnesota",
283+
"metadata": {},
284+
"outputs": [],
285+
"source": [
286+
"na_rates_df = (dataset.data.isna().mean() * 100).sort_values(ascending=False).to_frame()  # percent of missing values per column, using the frame's own row count\n",
287+
"na_rates_df.columns = [\"NA rate\"]\n",
288+
"na_rates_df"
289+
]
196290
},
197291
{
198292
"cell_type": "code",
199293
"execution_count": null,
200-
"id": "desperate-button",
294+
"id": "friendly-appearance",
201295
"metadata": {},
202296
"outputs": [],
203297
"source": []

0 commit comments

Comments
 (0)