From 5d5b1868629bd4f590205619c257289fa46947a2 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sat, 23 Mar 2019 00:02:32 -0300 Subject: [PATCH 01/23] is JSON data preparation --- .../isJson_dataPrep.ipynb | 392 ++++++++++++++++++ 1 file changed, 392 insertions(+) create mode 100644 analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb new file mode 100644 index 0000000..29b5242 --- /dev/null +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb @@ -0,0 +1,392 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start client" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 4
  • \n", + "
  • Memory: 8.59 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data\n", + "Using 10% sample and self produced samples\n", + " - 10% sample has 11292867 rows\n", + " - Filtered by value_len > df.mean() has 499805 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Original sample\n", + "df = dd.read_parquet('sample_0.parquet', \n", + " engine='pyarrow', \n", + " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url'])\n", + "\n", + "# df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", + "df_index={'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str}\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtered value_len > 1356\n", + "1356 is the value_len mean\n", + "\n", + "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", + "\n", + "All values above the mean count up to 499805 rows. That is just 4,42% of the whole sample, and a lot easier to work on. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "499805" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Save\n", + "dff = df[df['value_len'] > 1356]\n", + "dd.to_parquet(df=dff, path='filtered_above_mean.parquet', engine='pyarrow')\n", + "# len(dff)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url'], dtype='object')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Read\n", + "df = dd.read_parquet('filtered_above_mean.parquet', engine='pyarrow')\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DF overview\n", + "Some overview about the sample: \n", + "- Mean: 1356.97,\n", + "- Min: 0,\n", + "- Max: 4496861\n", + "- Std: 26310.62" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1356.9776628910975 0 4496861 26310.62140481331\n" + ] + } + ], + "source": [ + "df_mean = df['value_len'].mean()\n", + "df_min = df['value_len'].min()\n", + "df_max = df['value_len'].max()\n", + "df_std = df['value_len'].std()\n", + "(df_mean, df_min, df_max, df_std) = dd.compute(df_mean, df_min, df_max, df_std);\n", + "print(df_mean, df_min, df_max, df_std)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# is JSON analysis\n", + "\n", + "After manual initial analysis I have think that the huge values are json structured, to validate that I included an new column that is a boolean value with the validation of json" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "def 
is_json(myjson):\n", + " try:\n", + " json.loads(myjson)\n", + " return True\n", + " except ValueError as e:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", + "df['is_json'] = df['value'].apply(is_json)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### is_JSON data\n", + "Saving the new produced data with 'is_json' columns into disk" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/fastparquet/util.py:221: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", + " inferred_dtype = infer_dtype(column)\n" + ] + } + ], + "source": [ + "#save\n", + "df.to_parquet('is_json_above_mean.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000valuevalue_lensymbolscript_urlis_json
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...True
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "\n", + " value value_len \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "\n", + " symbol script_url \\\n", + "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", + "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... 
\n", + "\n", + " is_json \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read\n", + "df = dd.read_parquet('is_json_above_mean.parquet')\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From cd0ac0c95115e559d716fb7ffe65353dc5336eac Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sat, 23 Mar 2019 00:03:53 -0300 Subject: [PATCH 02/23] Quantitative analysts for json values --- .../isJson_Quantity_Analysis.ipynb | 509 ++++++++++++++++++ 1 file changed, 509 insertions(+) create mode 100644 analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb new file mode 100644 index 0000000..0a209bd --- /dev/null +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 4
  • \n", + "
  • Memory: 8.59 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data\n", + "Using the filtered data, already evaluated for JSON validity, stored in 'is_json_above_mean.parquet'. You can produce it by running the 'isJson_dataPrep.ipynb' notebook.\n", + "\n", + "This new sample has 499805 rows, meaning that it is only 4,42% of the original sample (most values are smaller than the sample's mean of 1356)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000valuevalue_lensymbolscript_urlis_json
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...True
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "\n", + " value value_len \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "\n", + " symbol script_url \\\n", + "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", + "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... 
\n", + "\n", + " is_json \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet('is_json_above_mean.parquet')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DF overview\n", + "Some overview about the sample after the data prep: \n", + "- Rows: 499805\n", + "- Mean: 27829.33,\n", + "- Min: 1357,\n", + "- Max: 4496861\n", + "- Std: 122092.41" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27829.332847810645 1357 4496861 122092.41371885882\n" + ] + } + ], + "source": [ + "df_mean = df['value_len'].mean()\n", + "df_min = df['value_len'].min()\n", + "df_max = df['value_len'].max()\n", + "df_std = df['value_len'].std()\n", + "(df_mean, df_min, df_max, df_std) = dd.compute(df_mean, df_min, df_max, df_std);\n", + "print(df_mean, df_min, df_max, df_std)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantity analysis " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Whole\n", + "This whole sample has: \n", + "- False: 307577 rows\n", + " - 61,54% are not valid JSON\n", + " \n", + " \n", + "- True: 192228 rows\n", + " - 38,46% are valid JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 307577\n", + "True 192228\n", + "Name: is_json, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['is_json'].value_counts().compute()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ORIGINAL SAMPLE: One std above the mean\n", + "Original Sample Data: \n", + "- Mean: 1356.97\n", + "- Std: 26310.62\n", + "\n", + "I'll be 
using the original sample's mean and std to make the following analyses\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "ROW_COUNT = 499805\n", + "MEAN = 1356\n", + "STD = 26310" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "- 46745 rows have the value_len greater than 27666\n", + "- This represents 9.35% of the rows on this sample\n", + "\n", + "As the value_len increases the percentage of valid JSON in the 'value' column also increases, for this filtered sample the following data was verified: \n", + "- True: 46691 rows\n", + " - 99,88% are valid JSON\n", + " \n", + "- False: 54 rows\n", + " - 0,11% are not valid JSON\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len: 46745 (9.35%)\n" + ] + } + ], + "source": [ + "dfa = df[df['value_len'] > (MEAN + STD)]\n", + "length = len(dfa)\n", + "print(\"len: {0} ({1:0.2f}%)\".format(length, length / ROW_COUNT * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 46691\n", + "False 54\n", + "Name: is_json, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfa['is_json'].value_counts().compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## All greater values are JSON\n", + "\n", + "There is absolutely no value with a value_len greater than 104653 that is not valid JSON. \n", + "\n", + "This implies that all the greater values are JSON but they represent a very low percentage of the whole data. \n", + "\n", + "The top 46745 greatest value_len rows are almost all valid JSON (46691 of them); they make up 9.35% of this sample (value_len > mean) and 0,41% of the original sample with all the smaller values. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "group = dfa.groupby('is_json')\n", + "group_result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']}).compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
is_json
False82460.05555613627.1180632813510465354
True271422.740185412552.29861327669449686146691
\n", + "
" + ], + "text/plain": [ + " value_len \n", + " mean std min max count\n", + "is_json \n", + "False 82460.055556 13627.118063 28135 104653 54\n", + "True 271422.740185 412552.298613 27669 4496861 46691" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group_result" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len: 46745 (9.35%)\n" + ] + } + ], + "source": [ + "allJson = df[df['value_len'] > 104653]\n", + "length = len(dfa)\n", + "print(\"len: {0} ({1:0.2f}%)\".format(length, length / ROW_COUNT * 100))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 970ea0eeaabbc28f5fb33785d0bc15101eb52d3b Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sat, 23 Mar 2019 00:04:50 -0300 Subject: [PATCH 03/23] Readme with overview of the findings about the quantitative analysts --- .../README.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 analyses/2019_03_aliamcami_greatest_values_are_json/README.md diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/README.md b/analyses/2019_03_aliamcami_greatest_values_are_json/README.md new file mode 100644 index 0000000..67e78a1 --- /dev/null +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/README.md @@ -0,0 +1,26 @@ +# Overview + +All the greatest values are JSON, but they represent very little percentage of the whole data. 
+ +### Most of the data have small value_len + (mean = 1356 for the 10% sample) +- 95,58% of the data have value_len smaller than the mean +- 4,42% are bigger than the mean +- 9.35% are valid JSON + +### Values above the mean: +- 61,54% are NOT valid JSON +- 38,46% are valid JSON + +### Values that are 1 standard deviation (std) above the mean + (std = 26310 for 10% sample): +- 0,11% are NOT valid JSON +- 99,88% are valid JSON +- The bigger the value the greater the chance of being a valid JSON + +### Values 4 std above the mean +- 100% are valid JSON +- The biggest non-JSON value has a length of 104653 + +## +The top 46745 greatest value_len are almost all valid JSON (99,88%); they make up 9.35% of the filtered sample (value_len > mean) and 0,41% of the original 10% sample. From 0272b1ccb0e4da8db75e1f8623554c05321489ec Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 13:57:10 -0300 Subject: [PATCH 04/23] Sample comparasions for quantity of valid json values --- .../isJson_Sample_Comparasion.ipynb | 917 ++++++++++++++++++ 1 file changed, 917 insertions(+) create mode 100644 analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb new file mode 100644 index 0000000..20a2660 --- /dev/null +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb @@ -0,0 +1,917 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 4
  • \n", + "
  • Memory: 8.59 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data\n", + "This notebook starts using 'is_json_above_mean.parquet', a filtered dataset that you can get by running the data preparation notebook called 'isJson_dataPrep.ipynb'. \n", + "This parquet contains the 10% sample data filtered by values above the mean of value_len. \n", + "\n", + "This new sample has 499805 rows, meaning that it is only 4,42% of the original sample (most values are smaller than the sample's mean of 1356). " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_lenis_json
03713True
1103878True
2103878True
31358False
41358False
\n", + "
" + ], + "text/plain": [ + " value_len is_json\n", + "0 3713 True\n", + "1 103878 True\n", + "2 103878 True\n", + "3 1358 False\n", + "4 1358 False" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet('is_json_above_mean.parquet', columns=['value_len', 'is_json'])\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualization: " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/worker.py:2791: UserWarning: Large object of size 1.89 MB detected in task graph: \n", + " (" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cdf['value_len'].plot(kind='hist', legend=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "jsonGroup = cdf.groupby('is_json')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we cannot identify any non_json (blue) on the right side of the histogram. This means there all frquency of non-json values are very low or inexistent for the biggest values. Since there are so many small values, the biggest ones represent such a small portion that is hard to identify by look on the histograms and graphs. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "is_json\n", + "False AxesSubplot(0.125,0.125;0.775x0.755)\n", + "True AxesSubplot(0.125,0.125;0.775x0.755)\n", + "Name: value_len, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "jsonGroup['value_len'].plot(kind='hist', legend=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sample overview\n", + "Some overview about the sample after the data prep: \n", + "- Rows: 499805\n", + "- Mean: 27829.33,\n", + "- Min: 1357,\n", + "- Max: 4496861\n", + "- Std: 122092.41" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#Hardcoded data to fast use, but your can update for the calculed value within the next few cells \n", + "MEAN = 27829.33\n", + "MIN = 1357\n", + "MAX = 4496861\n", + "STD = 122092.41\n", + "COUNT = 499805\n", + "\n", + "#Information for original sample.\n", + "ORIG_MEAN = 1356.97\n", + "ORIG_MIN = 0\n", + "ORIG_MAX = 4496861\n", + "ORIG_STD = 26310.62\n", + "ORIG_COUNT = 11292867\n", + "\n", + "#hardcoded information about described data for values one std above the mean: \n", + "A_MEAN = 271204.44\n", + "A_MIN = 27669\n", + "A_MAX = 4496861\n", + "A_STD = 306555\n", + "A_COUNT = 46745" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def describedData(df):\n", + " tmp_mean = df['value_len'].mean()\n", + " tmp_min = df['value_len'].min()\n", + " tmp_max = df['value_len'].max()\n", + " tmp_std = df['value_len'].std()\n", + " tmp_count = df['value_len'].count()\n", + " (tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count) = dd.compute(tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count);\n", + " return (tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27829.332847810645 1357 4496861 122092.41371885882 499805\n" + ] + } + ], + "source": [ + "#Calculate the described data for mean sample\n", + "(MEAN, MIN, MAX, 
STD, COUNT) = describedData(df)\n", + "print(MEAN, MIN, MAX, STD, COUNT)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1356.9776628910975 0 4496861 26310.62140481331 11292867\n" + ] + } + ], + "source": [ + "#Calculate the described data for original sample\n", + "(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT) = describedData(dd.read_parquet('sample_0.parquet'))\n", + "print(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "271204.44978072523 27669 4496861 306555.0273738244 46745\n" + ] + } + ], + "source": [ + "#Calculate the described data for one std above the mean (using mean and stf of the original sample)\n", + "std_above = df[df['value_len'] > ORIG_STD + ORIG_MEAN]\n", + "(A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT) = describedData(std_above)\n", + "print(A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell will create a dataframe of the described data calculated above and save it into a CSV for future use, in case the calculations cannot be re-run. " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MEANMINMAXSTDCOUNT
ORIGINAL1356.9776630449686126310.62140511292867
ABOVE_MEAN27829.33284813574496861122092.413719499805
ABOVE_STD271204.449781276694496861306555.02737446745
\n", + "
" + ], + "text/plain": [ + " MEAN MIN MAX STD COUNT\n", + "ORIGINAL 1356.977663 0 4496861 26310.621405 11292867\n", + "ABOVE_MEAN 27829.332848 1357 4496861 122092.413719 499805\n", + "ABOVE_STD 271204.449781 27669 4496861 306555.027374 46745" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Comparasion of this sample and original 10% sample:\n", + "import pandas as pd\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "compare = pd.DataFrame([(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT),\n", + " (MEAN, MIN, MAX, STD, COUNT), \n", + " (A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT)], \n", + " columns=['MEAN', 'MIN', 'MAX', 'STD', 'COUNT'],\n", + " index= ['ORIGINAL','ABOVE_MEAN', 'ABOVE_STD'])\n", + "compare.to_csv('describedData.csv')\n", + "compare" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Number of rows\n", + "The number of rows after filtering for values above the mean are about 4.42% of the original sample. \n", + "And the count for values one std above the mean is just 9.35% of this sample or 0.41% of original sample. \n", + "By this we can see that the really big values represent just a very small portion of the whole. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Above the mean / original 4.425846864219688\n", + "1 STD Above the mean / original 0.41393385754033946\n", + "1 STD Above the mean / Above mean 9.35264753253769\n" + ] + } + ], + "source": [ + "print('Above the mean / original', COUNT / ORIG_COUNT * 100)\n", + "print('1 STD Above the mean / original', A_COUNT / ORIG_COUNT * 100)\n", + "print('1 STD Above the mean / Above mean', A_COUNT / COUNT * 100)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "compare['COUNT'].plot(kind='pie')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Max and Min values\n", + "\n", + "It is expected that the maximum will be the same for all mentioned samples, since the filtering is done on the minimum, and it is also expected that the min is the value used to filter." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "compare[['MIN','MAX']].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mean and Std\n", + "> A low standard deviation indicates that the data points tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the data points are spread out over a wider range of values. (https://en.wikipedia.org/wiki/Standard_deviation)\n", + "\n", + "It is noticeable that both mean and std are increasing as the data is filtered by bigger values. \n", + "\n", + "The increase of the mean is to be expected, since we are eliminating the smaller values and leaving only the bigger ones. \n", + "\n", + "Unlike the mean, the STD is not guaranteed to increase after the filter. Since it does increase here, we can safely assume that the bigger the values get, the more spread out they are; the mean of the sample is therefore less accurate at representing the whole dataset, since the values differ hugely from one another. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAacAAAD9CAYAAAAYjbi9AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xd4VFX6wPHvS4CEklBDDRBKkCJNQlNRVFRAFLEBShMQ9KdrWXdXFNfOWta1rroiICBdRUEQEQs2IBB6h9ADIYEEUoD08/vj3sEhhCRAkjvl/TxPHmbOnHvvOzDknXvPe88RYwxKKaWUJynjdABKKaVUXpqclFJKeRxNTkoppTyOJiellFIeR5OTUkopj6PJSSmllMfR5KSUUsrjaHJSSinlcTQ5KaWU8jhlnQ6guNWsWdOEh4c7HYZSSnmVNWvWHDPGhDodh4vPJafw8HCio6OdDkMppbyKiOx3OgZ3ellPKaWUx9HkpJRSyuNoclJKKeVxfG7MKT9ZWVnExsaSnp7udCiOCAoKIiwsjHLlyjkdilJKFYlfJKfY2FiCg4MJDw9HRJwOp1QZY0hMTCQ2NpbGjRs7HY5SShWJX1zWS09Pp0aNGn6XmABEhBo1avjtWaNSyjv5RXIC/DIxufjze1dKeSe/uKynlFLqPIyBNZ86HcU5/ObMyWkiwpAhQ848z87OJjQ0lL59+wIwZcoUQkNDad++/ZmfrVu3nun/9ttvExQURHJy8pm2ZcuWISJ88803Z9r69u3LsmXLSv4NKaW838ljMPteWPiE05Gco9DkJCJBIrJKRDaIyBYRedFubywiUSKyS0TmiEh5uz3Qfh5jvx7utq+n7fYdInKzW3svuy1GRMa6ted7DG9UqVIlNm/ezOnTpwFYunQp9evXP6vPgAEDWL9+/ZmfVq1anXlt1qxZdOrUia+++uqsbcLCwhg/fnzJvwGllG/Z/RN8dCXE/AA3v+p0NOcoyplTBnC9MaYd0B7oJSJdgdeBt40xEcBxYKTdfyRw3BjTDHjb7oeItAIGAq2BXsCHIhIgIgHAB0BvoBUwyO5LAcfwSr1792bRokWAlWwGDRpUpO12795NWloar7zyCrNmzTrrtXbt2lGlShWWLl1a7PEqpXxQdgYsGQef9YcK1eCBn6Db/zkd1TkKHXMyxhggzX5azv4xwPXAvXb7VOAF4COgn/0Y4Avgv2KNyPcDZhtjMoC9IhIDdLb7xRhj9gCIyGygn4hsK+AYF+3Fb7aw9XDKpeziHK3qhfD8ra0L7Tdw4EBeeukl+vbty8aNGxkxYgS//fbbmdfnzJnD77//fub5ihUrqFChwplE1r17d3bs2EFCQgK1atU60+/ZZ5/l2Wef5cYbbyzW96WU8jEJ2+HLURC/CTo9ADe9DOUqOB1Vvoo05mSf4awHEoClwG7ghDEm2+4SC7iuUdUHDgLYrycDNdzb82xzvvYaBRzDK7Vt25Z9+/Yxa9Ys+vTpc87reS/rVahgfWhmz57NwIEDKVOmDHfccQeff/75Wdt1794d4KxEp5RSZxgDqz6BCddCahwMmgO3vOmxiQmKWK1njMkB2otIVeAroGV+3ew/86tbNgW055cgC+p/DhEZDYwGaNiwYX5dzijKGU5Juu222/jb3/7GsmXLSExMLLT/xo0b2bVr15mzoszMTJo0acLDDz98Vr9x48Yxfvx4ypbVAkyllJuTx2D+I7
BzMTTrCf0+hODaTkdVqAuq1jPGnACWAV2BqiLi+k0YBhy2H8cCDQDs16sASe7tebY5X/uxAo6RN64JxphIY0xkaKjHLEeSrxEjRvDcc8/Rpk2bIvWfNWsWL7zwAvv27WPfvn0cPnyYQ4cOsX//2bPb33TTTRw/fpwNGzaURNhKKW8U86NV9LD7R+j1Gtz7uVckJihatV6ofcaEiFQAegLbgJ+Bu+xuw4D59uMF9nPs13+yx60WAAPtar7GQASwClgNRNiVeeWxiiYW2Nuc7xheKywsjMceeyzf1+bMmXNWKfny5cuZPXs2/fv3P6tf//79mT179jnbjxs3jtjY2BKJWynlRbLS4bunYfodUKE6PPAzdH0IynjP3UNi5YACOoi0xSpGCMBKZnONMS+JSBNgNlAdWAcMNsZkiEgQ8BnQAeuMaaBbscM4YASQDTxujFlst/cB3rGPMdkYM95uz/cYBcUbGRlp8i42uG3bNlq2zO9KpP/QvwOl/ETCdvhyJMRvhs6j4caXijS2JCJrjDGRpRBhkRSlWm8jVqLJ276HP6vt3NvTgbvPs6/xwDk35RhjvgW+LeoxlFJK5WEMrJ4I3z8L5SvDvXOh+c2Fb+ehdPRcKaW8XdpRWPAI7PwOmt0It38IlWsVvp0H0+SklFLeLOYH+OohSE+GXq9DlzHgA5M9a3JSSilvlJUOP74IKz+E0JYw9Guo7eytMsVJk5NSSnmbhG32TA+bofMYuPFFj76h9mJoclJKKW/hmulh6T8hMNi6b6n5TU5HVSI0OZWi8ePHM3PmTAICAihTpgzVqlXj+PHjpKWlcfTo0TPLqH/44Yc888wzxMXFERgYSGZmJj179uSVV16hatWqDr8LpZQj0o7C/Idh1xKIuAn6feD1RQ8F0eRUSlasWMHChQtZu3YtgYGBHDt2jMzMTOrVq8eyZct48803Wbhw4VnbzJgxg8jISDIzM3n66afp168fv/zyi0PvQCnlmF0/wNd20UPvN6z7l3yg6KEg3nO7sJeLi4ujZs2aBAYGAlCzZk3q1atXpG3Lly/PG2+8wYEDB3R6IqX8SVY6LH4KZtwJlWrC6GU+U41XGP87c1o8Fo5sKt591mkDvV8rsMtNN93ESy+9RPPmzenZsycDBgzg2muvLfIhAgICaNeuHdu3b6ddu3aXGrFSytPFb7WKHhK2QJcHoeeLUC7I6ahKjZ45lZLKlSuzZs0aJkyYQGhoKAMGDGDKlCkXtI/CpppSSvkAYyBqAkzoAScT4L4voPfrfpWYwB/PnAo5wylJAQEB9OjRgx49etCmTRumTp3K8OHDi7RtTk4OmzZt0vnxlPJlaQl20cP3dtHDh1DZs1daKCn+l5wcsmPHDsqUKUNERAQA69evp1GjRkXaNisri3HjxtGgQQPatm1bkmEqpZyy83uY/3+QkQp93oROo/xibOl8NDmVkrS0NP7yl79w4sQJypYtS7NmzZgwYUKB29x3330EBgaSkZFBz549mT/f61cMUUrllZUOS5+DVR9DrdYw7BuoVXpXSHJzDf9ZuqPUjldUmpxKSceOHVm+fHm+r7ku9blbtmxZyQellHJW/Ba76GErdHkIer5QqmNL2Tm5jJ23iS/WeN46cFoQoZRSpc0YWPk/mHCdtYz6fV9a4+GlmJgysnP4y6x1fLEmlsd7RpTacYtKz5yUUqo0pSXA1/8HMUuheS+47b+lXvRwOjOHMdPX8OvOo/yzbytGXt2YJ0o1gsL5TXIyxiB+OrioJehKeYidS6zElJnmWNFDSnoWI6esZs3+47xxZ1vu6dSgVI9fVH6RnIKCgkhMTKRGjRp+l6CMMSQmJhIU5F/3SCjlUbJO20UPE6D25XDnxFItenBJTMtg2Ker2HEklfcHXcEtbeuWegxF5RfJKSwsjNjYWI4ePep0KI4ICgoiLCzM6TCU8k/xW+CLkXB0G3R9GG54zpEbao8kp3PfxJXEHj/NhKGRXHeZZ08a6xfJqV
y5cmdm/FZKqVJhDET9D5Y+D0FVYPCX0KynI6HsTzzJfROjOHEqi6kjOtO1SQ1H4rgQfpGclFKqVKXGWzfUxvxgFT30+8CauNUBO46kMmRSFJk5ucx8oAttw7xj2R1NTkopVZzcix5u+Q9EjnRspocNB08w7NNVlA8ow9wx3WheO9iROC6GJiellCoOWafh+3/C6k+gdhu76KGFY+Gs3JPIqKnRVKtUjhkju9KwRkXHYrkYhd6EKyINRORnEdkmIltE5DG7/QUROSQi6+2fPm7bPC0iMSKyQ0RudmvvZbfFiMhYt/bGIhIlIrtEZI6IlLfbA+3nMfbr4cX55pVSqlgc2WzNIr76E6vo4YEfHU1MP29PYNjkVdSpEsTnY670usQERZshIht40hjTEugKPCwirezX3jbGtLd/vgWwXxsItAZ6AR+KSICIBAAfAL2BVsAgt/28bu8rAjgOjLTbRwLHjTHNgLftfkop5Rlyc2HFh/DJdXD6OAyeB73+BWUDHQvpmw2HeWBaNM1rBzN3TDfqVPHO20gKTU7GmDhjzFr7cSqwDahfwCb9gNnGmAxjzF4gBuhs/8QYY/YYYzKB2UA/sW48uh74wt5+KnC7276m2o+/AG4Qf7tRSSnlmVLjYcZdsORpqwrvoeXQ7AZHQ5q96gCPzl5Hh4ZVmfFAF6pXKu9oPJfigubWsy+rdQCi7KZHRGSjiEwWkWp2W33goNtmsXbb+dprACeMMdl52s/al/16st1fKaWcs+M7+Kgb7F8Ot7wFA2c6Vo3nMvG3PYydt4lrIkKZNqILIUHlHI3nUhU5OYlIZeBL4HFjTArwEdAUaA/EAf9xdc1nc3MR7QXtK29so0UkWkSi/fVGW6VUKcg6DYuehFkDILgejPkFOjlXjQfWLDBvLd3JK4u20adNHT4ZGkmF8gGOxVNcilStJyLlsBLTDGPMPABjTLzb658AC+2nsYD7ZE1hwGH7cX7tx4CqIlLWPjty7+/aV6yIlAWqAEl54zPGTAAmAERGRupEckqp4ndkk7W8xdHt0O0Ra6YHB8eWwEpMLy/cxuQ/9nJ3xzBevaMNZQN8Y7GJolTrCTAJ2GaMecut3X1Spv7AZvvxAmCgXWnXGIgAVgGrgQi7Mq88VtHEAmPNSvozcJe9/TBgvtu+htmP7wJ+MjqLqVKqNOXmwooP4JPr4fQJGPIV3Dze8cSUk2t46suNTP5jL/dfFc7rd7b1mcQERTtzugoYAmwSkfV22zNY1XbtsS6z7QPGABhjtojIXGArVqXfw8aYHAAReQRYAgQAk40xW+z9PQXMFpFXgHVYyRD7z89EJAbrjGngJbxXpZS6MKlH4OuHYPdPcFkfa3mLSs4Pe2dm5/LEnPUs2hTHozdE8ETPCJ+b1Fp87UQkMjLSREdHOx2GUsrb7VgM8x+GzFNWeXjH+x0dW3I5nZnDQzPWsGzHUZ69pSWjujcplv2KyBpjTGSx7KwY6AwRSinlLvMUfP8sRE+COm3gzkkQepnTUQGQmp7FyCnRrN6fxGt3tGFg54ZOh1RiNDkppZRL3Ear6OHYDo8penBJOpnJsMmr2BaXwnsDO3Bru3pOh1SiNDkppVRuLqz8EH58ESpUhyFfQ9PrnI7qjCPJ6QyZFMWBpFNMGNqR61vUdjqkEqfJSSnl384qergFbnvfI4oeXA4knuK+SStJSstkyv2d6dbUc2IrSZqclFL+a/u3VtFD1mno+w50HO4RRQ8uu+JTGTwpiozsXGY80JX2DbxjLabioMlJKeV/Mk/B9+MgejLUaWsXPTR3OqqzbIpNZujkKMoGlGHO6G5cVsd71mIqDpqclFL+xb3o4cpH4fpnPabowSVqTyIjp0ZTtWI5ZozqQqMalZwOqdRpclJK+YfcXFj5AfzwojVJ69D50KSH01Gd4+cdCTz42RrCqlVg+qgu1K1SwemQHKHJSSnl+1LirKKHPT9Di75W0UPF6k5HdY5FG+N4fM46mtcOZtqIzt
So7FlndKVJk5NSyrdtXwTzH4HsdLj1XbhimEcVPbjMXX2QsfM2ckXDakwa3okqFbx7yYtLpclJKeWbMk/BkmdgzadQt51V9FAzwumo8jXp9728vHAr3SNq8vGQjlQsr7+a9W9AKeV74jbAFyMhMQauegyuexbKet6qsMYY3vsxhrd/2Emv1nV4d1B7Ast6/1pMxUGTk1LKd+Tmwor/wo8vuRU9XOt0VPkyxjB+0TYm/r6XO68I4/U7fWctpuKgyUkp5RtS4uDrB2HPMo8uegBrLaZxX21i9uqDDL8ynOf6tqJMGc8bB3OSJiellPfbthAWPALZGXDre3DFUI8segBrLaa/zl3Pwo1x/OX6Zvz1xuY+txZTcdDkpJTyXpkn7aKHKVC3Pdw50WOLHgDSs3J4aPoaft5xlGf6tGD0NU2dDsljaXJSSnmnw+utmR4SY+Cqx+G6cR5Z9OCSmp7FqKnRrNqXxL/6t+HeLr67FlNx0OSklPIuubmw4n348WWoFArDFkDja5yOqkDHT2Yy7NNVbDmcwjsD2tOvfX2nQ/J4mpyUUt4j5TB89SDs/QVa3mqNL3lo0YNLQko6gydFsS/xFB8P7kjPVr6/FlNx0OSklPIO276BBX+xih5uex86DPHYogeXg0mnuG9iFIlpGUy5vxNXNq3pdEheQ5OTUsqzZZ6E756GtVPtoodJULOZ01EVKiYhlcETV3E6K4fpo7rQoWE1p0PyKpqclFKe6/A6u+hhN1z9BPR4xqOLHlw2H0pm6ORVlBFhzpiutKgT4nRIXkeTk1LK8+TmwvL34KdXvKbowWX1viRGfLqakArWWkzhNf1vLabioMlJKeVZkg9ZMz3s/RVa3mbNJO7hRQ8uv+w8ypjPoqlXxVqLqV5V/1yLqTgUOpGTiDQQkZ9FZJuIbBGRx+z26iKyVER22X9Ws9tFRN4TkRgR2SgiV7jta5jdf5eIDHNr7ygim+xt3hP7dunzHUMp5aO2LoCProTYNXDbf+GeaV6TmBZvimPU1NU0qVmZuQ9208R0iYoyy2A28KQxpiXQFXhYRFoBY4EfjTERwI/2c4DeQIT9Mxr4CKxEAzwPdAE6A8+7JZuP7L6u7XrZ7ec7hlLKl2SetCrx5g6B6o3hwd/gCs+vxnP5PPogD89cS9uwqswa3ZWafrxIYHEpNDkZY+KMMWvtx6nANqA+0A+YanebCtxuP+4HTDOWlUBVEakL3AwsNcYkGWOOA0uBXvZrIcaYFcYYA0zLs6/8jqGU8hWH1sLH18Daz+Dqv8LIpVDDe6b1mfLHXv7+xUaualaTz0Z29vtFAovLBY05iUg40AGIAmobY+LASmAiUsvuVh846LZZrN1WUHtsPu0UcIy8cY3GOvOiYUOdEkQpr5Cb82fRQ+XaMOwbaNzd6aiKzBjDf3+K4T9Ld3Jz69q8N6iDrsVUjIqcnESkMvAl8LgxJqWAWXTze8FcRHuRGWMmABMAIiMjL2hbpZQDkg/BV2Ng32/Qqh/0fcdrxpbASkyvLt7OhF/3cEeH+rxxV1tdi6mYFSk5iUg5rMQ0wxgzz26OF5G69hlNXSDBbo8FGrhtHgYcttt75GlfZreH5dO/oGMopbzV1vmw4FHIyYJ+H0D7+7xmbAmstZie/Xozs1YdYGi3Rrxwa2tdi6kEFKVaT4BJwDZjzFtuLy0AXBV3w4D5bu1D7aq9rkCyfWluCXCTiFSzCyFuApbYr6WKSFf7WEPz7Cu/YyilvE1GGsx/BOYOhepNrKKHDoO9KjFl5eTy+Jz1zFp1gIeva8qLt2liKilFOXO6ChgCbBKR9XbbM8BrwFwRGQkcAO62X/sW6APEAKeA+wGMMUki8jKw2u73kjEmyX78EDAFqAAstn8o4BhKKW9yaK0100PSHuj+JPR4GgK8q3AgPSuH/5uxlp+2J/BUrxY81MN7ija8kVgFcr4jMjLSREdHOx2GUgqsooc/3oWfx0PlOnDHxxB+tdNRXbC0jGxGTV
1N1N4kXup3OUO6NnI6pGInImuMMZFOx+GiM0QopUpGcqy1vMW+36B1f+j7NlTwvvvoT5zKZNinq9l8KJm372nP7R10LabSoMlJKVX8tnwN3zxmFz18CO3v9aqxJZeElHSGTFrF3sST/G9wR27UtZhKjSYnpVTxyUiDxU/B+ulQvyPc8YlX3VDr7mDSKQZPiuJoagafDu/EVc10LabSpMlJKVU8Dq2xix72Qve/QY+xXlf04BKTkMaQSVGczMhm+qguXKFrMZU6TU5KqUuTmwN/vAM//8sqehi+CMKvcjqqi7b5UDLDJq9CRJgzphst6+paTE7Q5KSUunjJsTBvDOz/3auLHlyi9yVx/5TVBAeWZfqoLjQJrex0SH5Lk5NS6uJs+coqesjNgds/gnaDvLLoweXXnUcZ89ka6lQJYvqoLtTXJS8cpclJKXVhMlJh8Vi76CES7vzEmvHBi323+QiPzlpHk9BKfDayC6HBuuSF0zQ5KaWKLnYNfDkSTuyHa/4O1z7ltUUPLl+uieUfX26kbVgVpgzvTJWK3v1+fIUmJ6VU4XJz4Pe3YdmrEFzXKnpodKXTUV2yaSv28dz8LVzVrAYThkRSKVB/JXoK/ZdQShXsxEFreYv9f8Dld8Itb0GFqk5HdUmMMXy4bDf/XrKDG1vV5v1BHQgqp2sxeRJNTkqp89s8DxY+bhc9/A/aDfTqogewEtNr323n41/20N9ei6mcrsXkcTQ5KaXOlZFqz/QwA8I6wR0TvL7oAay1mP45fzMzow4wuGtDXrrtcl3ywkNpclJKnS022prp4cR+uOYfcO0/vL7oAay1mP72+Qbmrz/Mg9c25alel1HAit7KYZqclFKW3Bz4/S34+VUIqQfDv4VG3ZyOqlikZ+XwyMy1/LAtgX/0uoz/69HM6ZBUITQ5KaWsood5o+HAcrj8LrjlP15f9OByMiObB6ZFs3x3Ii/3a82QbuFOh6SKQJOTUv5u85fwzRNgcqH/BGh7j9cXPbicOJXJ8E9Xs+lQMm8PaEf/DmFOh6SKSJOTUv4qIxW+/TtsmGUXPXwC1Rs7HVWxSUhNZ+ikVew5epIP77uCm1vXcTokdQE0OSnlj2Kj7ZkeDlizPFzzDwjwnV8HscdPMXhiFPEpGUwe3omrI3QtJm/jO59GpVThcnPgt7esmR5C6sP9i6FhV6ejKla7j6YxZGIUafZaTB0bee8s6f5Mk5NS/uLEAbvoYQW0udsqegiq4nRUxWrL4WSGTloFwKzRXWldz7fenz/R5KSUP9j0BSz8659FD+0GOB1RsVuzP4nhn66msr0WU1Ndi8mraXJSypelp8Dif9hFD52t5S2qhTsdVbH7fdcxHpgWTe2QQKaP6kJYtYpOh6QuUaETSonIZBFJEJHNbm0viMghEVlv//Rxe+1pEYkRkR0icrNbey+7LUZExrq1NxaRKBHZJSJzRKS83R5oP4+xXw8vrjetlF84uBo+7g4b50CPp63xJR9MTN9vOcKIKatpVKMicx/sponJRxRltsMpQK982t82xrS3f74FEJFWwECgtb3NhyISICIBwAdAb6AVMMjuC/C6va8I4Dgw0m4fCRw3xjQD3rb7KaUKk5sDv7wBk2+2LuPd/x30GOtT1XguX62L5aEZa2lVL4TZo7tSKzjI6ZBUMSk0ORljfgWSiri/fsBsY0yGMWYvEAN0tn9ijDF7jDGZwGygn1gTW10PfGFvPxW43W1fU+3HXwA3iE6EpVTBju+HKbfAz+Ot5S0e/B0adnE6qhLx2Yp9PDFnA10aV2f6qC5UrVje6ZBUMbqUeeIfEZGN9mU/V61mfeCgW59Yu+187TWAE8aY7DztZ+3Lfj3Z7q+Uys+mL+B/V0P8FuuG2js/8blqPJcPl8Xwz/lb6NmyFpOHd6KyLhLocy42OX0ENAXaA3HAf+z2/M5szEW0F7Svc4jIaBGJFpHoo0ePFhS3Ur4nPcUqEf9yJNRqCQ/+Zk1B5IOMMby2eD
tvfLeDfu3r8dHgjrpIoI+6qK8bxph412MR+QRYaD+NBRq4dQ0DDtuP82s/BlQVkbL22ZF7f9e+YkWkLFCF81xeNMZMACYAREZG5pvAlPJJB1dZy1skH7SKHrr/zSfHlgBycw3PLdjM9JUHuLdLQ17udzkBuhaTz7qoMycRqev2tD/gquRbAAy0K+0aAxHAKmA1EGFX5pXHKppYYIwxwM/AXfb2w4D5bvsaZj++C/jJ7q+UysmGZa/DZLtWyYeLHgCyc3J58vMNTF95gDHXNmH87ZqYfF2hn2QRmQX0AGqKSCzwPNBDRNpjXWbbB4wBMMZsEZG5wFYgG3jYGJNj7+cRYAkQAEw2xmyxD/EUMFtEXgHWAZPs9knAZyISg3XGNPCS361SvuD4fusy3sGV0HYA9HkTgkKcjqrEpGfl8JdZ61i6NZ6/33wZ/9ejqS4S6AfE105GIiMjTXR0tNNhKFUyNn4Oi/5qPb7lLWh7t7PxlLCTGdmM/iyaP2ISefG21gy7MtzpkHyWiKwxxkQ6HYeLb14DUMrXpCdby1tsnAMNusIdE6BaI6ejKlHJp7K4f8oq1h88wX/ubsedHXUtJn+iyUkpT3cgCuaNguRD0OMZ6P6kz44tuRxNzWDo5FXsTkjjw/s60utyXYvJ3/j2J1wpb5aTDb/+G359A6o0gBHfQYPOTkdV4g6dOM2QiVHEJaczaXgk3SNCnQ5JOUCTk1Ke6Pg+u+ghCtoOhD7/9umiB5c9R9MYPDGK1PRsPhvZmcjw6k6HpByiyUkpT7NxLix6EhC4cxK0uavQTXzBtrgUhkyKItdYazFdXt83Z7dQRaPJSSlPkZ5sJaVNn0PDblbRQ9WGTkdVKtYeOM7wyauoWN5ai6lZLV2Lyd9pclLKExxYCfMesIoerhsHV//V54seXJbHHGPUtGhCgwOZPrILDarrkhdKk5NSznIveqjaEEYsgQadnI6q1CzdGs/DM9fSuEYlPhvZmVohuuSFsmhyUsopSXutoofYVdBuEPR+wy+KHlzmrz/EX+du4PL6VZh6fydd8kKdRZOTUk7YMMcaX5IyflX04DJ95X7+OX8zXRpXZ+IwXfJCnUs/EUqVprOKHq6EOz72m6IHl4+W7eb177ZzfYtafHjfFbrkhcqXJielSsv+FdZlvJRDcN2z0P2vUMZ/fjEbY/j3kh18uGw3t7arx1v3tKNcwKWsd6p8mSYnpUpSThbsWgobZsL2RdZZ0sjvIcxj5tcsFbm5hhe+2cK0FfsZ1LkBr9zeRpe8UAXS5KRUcTMGjmyE9bOsy3enjkGlUOj2CFz7DwgMdjrCUpWdk8s/vtzIvLWHGH1NE57u3UKXvFCF0uSkVHFJPWLN7rBhFiRshYDQm5DFAAAaz0lEQVTycFlvaHcvNLsBAso5HWGpy8jO4dFZ61iyJZ4nb2zOI9c308SkikSTk1KXIisddiyyzpJ2/wgmF8I6WWstte4PFf13brhTmdmM+WwNv+06xvO3tuL+qxo7HZLyIpqclLpQxlgTsq6fCVu+hoxkCAmDq5+w7leqGeF0hI5LPp3FiCmrWXfgOP++qy13RzZwOiTlZTQ5KVVUJw7AhtnWZbukPVCuIrS8DdoPgvBroIxWngEcS8tg6KRV7EpI5YN7r6B3m7pOh6S8kCYnpQqSkQpbF1gJad9vVlt4d+j+N2h1m98VNxTm8InTDJ4UxeETp5k4rBPXNte1mNTF0eSkVF65ObD3V+ssadsCyDoF1ZtY9ya1G+B3N80W1b5jJ7lvYhQpp7OYNqILnRv773ibunSanJRyObbLGkfaOMe6UTawCrS9x6q2a9AZtMrsvLYfSWHIpFXk5Bpdi0kVC01Oyr+dSoIt86xqu0PRIAFW2fdNr8BlfaCczpJdmPUHTzBs8iqCypVh1piuNKullzrVpdPkpPxPThbE/GCdJe38DnIyoVZrKyG1uQeCazsdoddYvvsYD0yNpkblQGaM0rWYVPEpNDmJyGSgL5
BgjLncbqsOzAHCgX3APcaY42LdXfcu0Ac4BQw3xqy1txkGPGvv9hVjzFS7vSMwBagAfAs8Zowx5zvGJb9j5b/iNlqFDZs+h5NHoWJNiBxpVdvVaauX7S7QD1vj+b+ZawmvUZHPRnahtq7FpIpRUWpfpwC98rSNBX40xkQAP9rPAXoDEfbPaOAjOJPMnge6AJ2B50Wkmr3NR3Zf13a9CjmGUkWXlgDL/wsfXQUfd4fVE60l0AfNhie3Q+/XoG47TUwXaP76Qzw4fQ0t6gQzZ3Q3TUyq2BV65mSM+VVEwvM09wN62I+nAsuAp+z2acYYA6wUkaoiUtfuu9QYkwQgIkuBXiKyDAgxxqyw26cBtwOLCziGUgXLSocd31pnSTE/gsmB+h2hz5tw+Z1+PWtDcZgZdYBxX2+iU3h1Jg2LJDjI/6ZlUiXvYsecahtj4gCMMXEiUsturw8cdOsXa7cV1B6bT3tBx1DqXMZA7Gp71oZ51rpJIfXhqsesWRtCmzsdoU+Y8Otu/vXtdq67LJSPBnfUtZhUiSnugoj8ro2Yi2i/sIOKjMa6NEjDhnoPil85cRA2zraq7ZJ2Q9kK1s2x7QZB42v8ar2kkmSM4a2lO3n/pxhuaVuXt+9pT/myOiOGKjkXm5ziRaSufUZTF0iw22MB90m0woDDdnuPPO3L7PawfPoXdIxzGGMmABMAIiMjLzi5KS+TkWbdHLthFuz9DTDQ6Gpr8b5W/XTWhmKWm2t4aeFWpizfx8BODRjfX9diUiXvYpPTAmAY8Jr953y39kdEZDZW8UOynVyWAP9yK4K4CXjaGJMkIqki0hWIAoYC7xdyDOWPcnOt6YM2zLKmE8o6CdUaQ4+nrVkbqoU7HaFPys7JZey8TXyxJpZRVzdm3C0tdckLVSqKUko+C+usp6aIxGJV3b0GzBWRkcAB4G67+7dYZeQxWKXk9wPYSehlYLXd7yVXcQTwEH+Wki+2fyjgGMqfHIuxEtLGOZB8EAJDoM1d0P5eaNBFq+xKUEZ2Do/PXs/izUd4omdzHr1B12JSpUeswjrfERkZaaKjo50OQ12K08dh8zwrKcWuBikDTa+3xpFa3ALlKjgdoc9zX4vpub6tGHG1rsXk60RkjTEm0uk4XHSGCOUZcrKtxfrWz4QdiyEnA2q1ghtfhjZ3Q4guu1BaUtKzGPHpatYeOM4bd7blnk66FpMqfZqclLOObLYv282FkwlQsQZE3m+dJenNsaUuMS2DoZNXsTM+lfcHXcEtbfVLgXKGJidV+tISrCmE1s+C+E1Qphw0v9kaR2p2I5Qt73SEfiku+TSDJ0YRe/w0E4ZGct1lemuhco4mJ1U6sjOsy3UbZsGupdasDfWu0FkbPMT+RGstphOnspg2ojNdmtRwOiTl5zQ5qZJjDMRGw4aZVoFD+gkIrgtX/sW6bFerhdMRKmDHkVSGTIoiKyeXmQ90oW1YVadDUkqTkyoBybHWKrIbZkPiLmvWhpZ9rYTUpIfO2uBBNhw8wbBPV1E+oAxzx3QjorbewKw8gyYnVTwyT8K2b6xqu72/Ys3acJU1t12rfhAU4nSEKo8VuxMZNXU11SuXZ8bIrjSsoWsxKc+hyUldvNxc2P+7dYa0dT5kplkzNfQYC20HQHW9N8ZT/bQ9noemr6VhdWstpjpVdMkL5Vk0OakLl7jbKmzYMAeSD0D5YGjd36q2a9hNy7893DcbDvPEnPW0rBvC1BGdqV5JqyOV59HkpIrm9AnY8pWVlA5GWbM2NLkOej4Pl/WB8npJyBvMXnWAp7/aRKdG1Zk4PJIQXYtJeShNTur8crJh909WQtq+yJq1IbQF9HzRumynszZ4lYm/7eGVRdu4tnko/xvckQrltTBFeS5NTupc8VuswoZNn0NaPFSoDh2HWdV29TroZTsvY4zh7R928d6Pu7ilTV3eHqBrMSnPp8lJWU4es2dtmAlHNkKZstC8l5WQIm7SWRu8VG6u4eVFW/n0j33cExnGq3
e01bWYlFfQ5OTPsjNg53fWNEIxSyE3G+q2h95vwOV3QSWdJcCb5eQaxn65kc/XxDLiqsY8e0tLymhiUl5Ck5O/MQYOrbVnbfjSWp4iuC50e9ietaGl0xGqYpCZncvjc9bx7aYjPHZDBI/3jNC1mJRX0eTkL5IPWQv2bZgFx3ZC2SBo0RfaD7Kq7nTWBp9xOjOHB6ev4ZedR3n2lpaM6t7E6ZCUumCanHxZ5knYttA6S9rzC2Cs+5BufQ9a3w5BVZyOUBWzlPQsRk2JZvX+JF67ow0DOzd0OiSlLoomJ1+TmwsHllvjSFu/tmZtqNoIrn0K2g2A6vot2lclncxk2ORVbItL4b2BHbi1XT2nQ1Lqomly8hWJu/+8bHfCNWvD7dDOnrWhjJYO+7IjyekMmRTFgaRTTBjaketb1HY6JKUuiSYnb5aebM3asH4WHFwJiDXr9/X/tMaTdNYGv3Ag8RT3TVpJUlomU0d0pquuxaR8gCYnb5ObA7t/tsaRti+C7HSoeRn0fMGetUEv5fiTnfGpDJ4YRWZOLjMf6Eq7BroWk/INmpy8RfxW65LdxrmQdgQqVIMOQ6xqu3pX6KwNfmhj7AmGTV5FuYAyzBndjcvq6FpMyndocvJkJ4/Bpi+ss6S4DdasDRE3WfcjNb8ZygY6HaFySNSeREZOjaZqxXLMGNWFRjUqOR2SUsVKk5Onyc6EXUuscaRdS+xZG9pBr9ehzV1QqabTESqH/bw9gQenryGsWgVmjOqqazEpn3RJyUlE9gGpQA6QbYyJFJHqwBwgHNgH3GOMOS7W7envAn2AU8BwY8xaez/DgGft3b5ijJlqt3cEpgAVgG+Bx4wx5lJi9kjGwOG1VkLa/IU1a0Pl2tD1IavarnYrpyNUHmLRxjgem72Oy+oEM21EZ2pU1rNn5ZuK48zpOmPMMbfnY4EfjTGvichY+/lTQG8gwv7pAnwEdLGT2fNAJGCANSKywBhz3O4zGliJlZx6AYuLIWbPkHLYLv+eDUe3Q0AgtLjFWrSvyXUQoCe26k9zVx9k7LyNdGxUjUnDO+laTMqnlcRvv35AD/vxVGAZVnLqB0yzz3xWikhVEalr911qjEkCEJGlQC8RWQaEGGNW2O3TgNvx9uSUecqqstswE/YsA5MLDbrCre9Cq9uhglZbqXNN+n0vLy/cyjXNQ/lY12JSfuBSk5MBvhcRA3xsjJkA1DbGxAEYY+JEpJbdtz5w0G3bWLutoPbYfNq9jzGwf7mVkLbMh8xUqNIQuv8N2g2EGk2djlB5KGMM7/64i3d+2EXvy+vwzsD2BJbVxKR836Ump6uMMYftBLRURLYX0De/WmdzEe3n7lhkNNblPxo29KC5xJL2WpfsNsyCE/uhfGXr7KjdQGh0lc7aoApkjOGVRduY9Pte7uoYxmt3tKFsgH5mlH+4pORkjDls/5kgIl8BnYF4EalrnzXVBRLs7rFAA7fNw4DDdnuPPO3L7PawfPrnF8cEYAJAZGSkswUT6SnWnHbrZ1lz3CHQ5Fq4bhy07AvlteRXFS4n1/DMvE3MiT7I8CvDea5vK12LSfmVi05OIlIJKGOMSbUf3wS8BCwAhgGv2X/OtzdZADwiIrOxCiKS7QS2BPiXiFSz+90EPG2MSRKRVBHpCkQBQ4H3LzbeEpWbA3t+ts6Sti2E7NNQIwJueM6ataFKWOH7UMqWmZ3LE3PXs2hjHI9e34wnbmyuazEpv3MpZ061ga/s/zRlgZnGmO9EZDUwV0RGAgeAu+3+32KVkcdglZLfD2AnoZeB1Xa/l1zFEcBD/FlKvhhPK4ZI2G6NI22cC6lxEFTVqrRrfy/U76izNqgLlp5lrcW0bMdRnunTgtHX6Hik8k/ia7cNRUZGmujo6JI7wMlEawXZDTPh8DqQAGvWhvaDoHkvnbVBXbTU9CxGTY1m1b4kxt/ehnu7eND4qfJ5IrLGGBPpdBwueiNNUWRnwq7vrcKGnUsgNwvqtIGbX4U2d0PlUK
cjVF7u+MlMhn26iq2HU3hnQHv6tffOwlSliosmp/MxBuLW/zlrw6lEqFQLuoyx5rarc7nTESofEZ+SzuCJUexPOsXHQzpyQ0tdi0kpTU55pcTBprlWUjq6zZ61oY81jVDT63XWBlWsDiad4r6JUSSmZTDl/k5c2VTnTlQKNDlZsk5bszasn2lV3ZlcCOsMfd+G1v2t5SmUKma74lMZPCmK9KxcZjzQlfa6FpNSZ/hvcjIGDqy0Z234GjJSoEoD6P4ktB0INZs5HaHyYZtikxk6OYqyAWWYO0bXYlIqL/9LTsf3/Tlrw/F9UK4StOpnVds1ulpnbVAlbtXeJEZOWU1IBWstpvCaemO2Unn5R3LKSLXOjjbMgv1/AAKNr4Frx0LLWyGwstMRKj/xy86jjPksmnpVKzB9ZBfqVa3gdEhKeSTfTU65ObD3F6uwYds39qwNzeD6f1qzNlRtUPg+lCpGizfF8ejsdUTUCmbayM7U1LWYlDov30tO2enwwwuwYQ6kHoagKtYlu3b3QlikztqgSkRuriHxZCbxKekkpKYTn5JBfEq6/WM93haXQoeG1Zg8vBNVKuhaTEoVxPeSU8I2+OM9aNYTev0LmveGcrqMtbo4xhhS0rPPSTQJKekcsZ8npKSTkJpBdu7Zs62IQI1KgdQOCaR2SBBXN6vJYz0jqFje9/7bKVXcfO9/SUh9eDIaKtcqvK/ya6czc4g/k2TSSXCd7aRmEJ+cTnyq1Z6elXvOtiFBZalTJYjaIUE0Da1J7ZBA6lQJolZw0JlkFBocSDld4kKpi+J7yalyLU1Mfi4zO5ejaXaiSU7/M+HYCciVjFLTs8/ZtkK5ADvJBNIurOqZRPPnTyC1goN0JVqlSpjvJSfls3JyDYknM86c4bhfVrOeW48TT2aes225ADlzVhNRqzJXN6tJrZBAagcH2WdAgdQKCSI4sKwuT6GUB9DkpBxnjCH5dFaeIgK3ogL7MtvRtAxy8hnXqVnZGtepXzWIDg2rUtvt0prrbKdaxfK6WJ9SXkSTkypRJzOyzySaBHsM50hyBvGp6fYZj5WAMrLPHdepWrEctYODqBUSSPNaNf+8rGYnnTohQdSsXF6XLlfKB2lyUhclIzuHhDMJJ+PsMx23AoPUjHPHdSqWD6BOiJV0OjSsaj92O9uxE1JQOR3XUcpfaXJSZ8nJNSSmZZwZz8lbNh1vl00n5TOuUz6gjDWOExLEZXWC6R4RemY8x0o41uPgIL3HRylVME1OfsIYw4lTWcSnpnMk2b1s2rrM5rrkdjQ1gzzDOpQRCA22kk5YtYp0bFTtzGW1Wm5jO9UqltNiAqVUsdDk5APSXOM6Z+7NObdsOiElg8ycc8d1qlUsdya5tKgTTG37Elsdt8tsNSrpuI5SqnRpcvJg6Vk5HE3Nv2zafXznZGbOOdtWDixLrZBA6oQE0Sm8+pmy6dohQdSpYt2rExqs4zpKKc+kyckB2Tm5HEvLLLBsOj41nROnss7ZtnzZMmfGcFrWC6HHZbXOnOG4klGtkCAqB+o/rVLKe+lvsGKUm2s4firTSjR2qXR+ZdPH0s4d1wkoI4Ta9+s0rFGRzo2rn1U2XdtOPFUq6LiOUsr3aXIqAmMMqRnZZyWY/G4YTUhNJyvHnLN9jUrlz1SqtaobQm23CjZX4qlROZAAvUlUKaUAL0hOItILeBcIACYaY14rzv2nZ+Wck2wSUjM4kvzn4/iUdE7lM64THFT2THLp0ri6XUgQeKaooHZIIKHBgQSW1XEdpZS6EB6dnEQkAPgAuBGIBVaLyAJjzNbCts3KyeVYmivJnD07getxfEoGyafPHdcJLFvGuj8nOIjW9UK4vsWf4zqun1rBgVTScR2llCoRnv7btTMQY4zZAyAis4F+wHmT066ENCJf+YHEkxmYfMZ1atn36zSuWYmuTWqcSTSu5Q9qBwcRUkEn/1RKKSd5enKqDxx0ex4LdClog3JlhBtb1bJnoD57AtAalXTyT6
WU8gaenpzyyyTnVByIyGhgNEDDhg159Y62JR2XUkqpEuTpt/3HAg3cnocBh/N2MsZMMMZEGmMiQ0NDSy04pZRSJcPTk9NqIEJEGotIeWAgsMDhmJRSSpUwj76sZ4zJFpFHgCVYpeSTjTFbHA5LKaVUCfPo5ARgjPkW+NbpOJRSSpUeT7+sp5RSyg9pclJKKeVxNDkppZTyOJqclFJKeRwxeef48XIikgrscDqOIqgJHHM6iCLQOIuPN8QIGmdx85Y4LzPGBDsdhIvHV+tdhB3GmEingyiMiERrnMXHG+L0hhhB4yxu3hSn0zG408t6SimlPI4mJ6WUUh7HF5PTBKcDKCKNs3h5Q5zeECNonMVN47wIPlcQoZRSyvv54pmTUkopL+dIchKRMBGZLyK7RGS3iLwrIuVFpIeIJIvIOhHZLiJvum0zXET+6/Z8sIhsFJEtIrJBRCaKSFX7tWUiEmk/3iciX7ptd5eITMkTz3wRWZGn7QUR+VsJ/RUopZQqQKknJ7HWP58HfG2MiQCaA5WB8XaX34wxHYAOQF8RuSqfffQCngB6G2NaA1cAy4Ha5zlspIi0Pk88Ve3tq4pI44t/Z8rTiUh/ETEi0sJ+Hi4ip0Vkvf0FZ7mIXObW/2oRWWV/UdpuL2qJ/SUq75eZsiISLyJ1RWSKiOy197teRJYXENNwO6Yb8onzLvv5MhHZ4ba/L/LsY4OIzMrTNkVEDolIoP28pojsu+i/PFXqPPTzWltEFtrH3yoi34pIG7dtk9z29YNbzOtEZJsd37CivH8n7nO6Hkg3xnwKYIzJEZEngL3Az65OxpjTIrIea6n2vMYBfzPGHHLtA5hcwDHfBJ4B7svntTuBb4B4rPWiXr3gd6S8xSDgd6x/5xfstt3GmPYAIjIG63MyTETqADOB240xa0WkJrBERA4Bi4EwEQk3xuyz99MT2GyMibO+f/F3Y8xZSaQAm+zYfrSfDwQ25OlznzHmnPtQRKQl1pfMa0SkkjHmpNvLOcAI4KMixqE8iyd+Xl8Clhpj3rVjaGuM2QS4YpoCLHTtS0TC7Zg72M+bAPNEpIwrB5yPE5f1WgNr3BuMMSnAAaCZq01EqgERwK/n2cfaCzjmXOAKEWmWz2uDgFn2z6AL2KfyIiJSGbgKGIn1nz0/IcBx+/HDwBRjzFoAY8wx4B/AWGNMLvA5MMBt24FYn6GL8RvQWUTK2XE2A9YXcdt7gc+A74Hb8rz2DvCEiPjizfY+zYM/r3WxVijHPs7GC9nYGLMH+CvwaGF9nUhOAuRXIuhq7y4iG4EjWBn4SIE7+/OUcreIDDhPtxzg38DTebatjfWL4HdjzE4gW0Quv7C3o7zE7cB39r9zkohcYbc3dX1+sP7TvGW3n/MlCoi228H6jz0QwL501gf40q3vv90udcwoJDYD/ADcDPQj/9WeZ7jt799u7QOAOeT/5eoA1jfvIYUcX3keT/28fgBMEpGfRWSciNS7iPe2FmhRWCcnktMW4KypPEQkBGgA7MYac2oLtAEeEpH259nHFQDGmE32ae5ioEIBx/0MuAZo6NY2AKgG7LWvx4dz/m8pyrsNAmbbj2fz5y/y3caY9saYpsDj/Hmvx/m+RBkAY8xqoLJ9zb83sNIYc9yt39/t/bY3xuR3OTmv2VifvfN9o73PbX9/BxCRTsBRY8x+rEuCV9hXHNz9C/g7WpnrbTzy82qMWQI0AT7BSjDrRCT0At+bFKWTEx/YH4GKIjIUQEQCgP8AU4BTrk72N4ZXgafy2cerwJsiEubWVlBiwhiTBbyN9Q/qMgjoZYwJN8aEAx3R5ORzRKQG1ljnRPtLyN+xvpjk/U+yAOsLDOTzJQrr87HV7XlhCaXIjDGrgMuBmvZnvygGAS3s97Qb6zLPnXn2G4N1ifCeS4lPlR5P/7waY5KMMTONMUOA1W4xFFUHYFthnUo9ORnrrt/+wN0isgvYCaRjDezl9T+sgd6zqujspdvfAxbbFSPLsS7dLS
nk8JOwi0DsgbqGwEq3/e4FUkSki930rIjEun4u6I0qT3IXMM0Y08j+ItIAqwAnLE+/q7F+yYN1+WK468zd/oXxOvCGW/9ZwGCsXyT5XYq7UE+T//+Dc4hIGeBuoK3bl6t+5D9uOh7Q2yK8h8d+XkXkehGpaD8OBppiXT4u6vbhWAVq7xfW15GBUmPMQeDWfF5aZv+4+p3mz2q9vVhnV67XpgJTz7P/Hm6Pw90eZwDu10jPqQQ0xriu7UbxZ4WM8m6DgNfytH2JlQia2lWhAmQCowDsKqbBwCf2f0IB3jHGfOPagTFmq4icAtbkqZID6xr+s27POxtjMgsK0hizuICXZ4jIafvxMeAV4JCrYtX2K9BKROrm2e8WEVmLfSlceTxP/rx2BP4rItlYJzcT7UuGBWkqIuuAICAVeL+wSj3Q6YuUUkp5IB0kVUop5XH0/gelSoGI3A88lqf5D2PMw07Eo1RBPOHzqpf1lFJKeRy9rKeUUsrjaHJSSinlcTQ5KaWU8jianJRSSnkcTU5KKaU8zv8DOlODoGpQIvEAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "compare[['MEAN','STD']].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IS JSON\n", + "\n", + "This whole sample has: \n", + "- False: 307577 rows\n", + " - 61,54% are not valid JSON\n", + " \n", + "- True: 192228 rows\n", + " - 38,46% are valid JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPoAAADuCAYAAAAQqxqwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFdpJREFUeJzt3XmUXGWZx/HvU72xN5sssuQmkLDLzghhVVTGQoHjAC5wcEfCJg7IZVEvI8MpNhVGDFuGHZkTFZFcCMNiQhIIiWwBRIFgQcIuhCKYhaTzzh+3mmlip7uqu6qee+/7fM6p091Fdd9fh/r13d9XnHMYY/KtoB3AGNN8VnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFH4SI9IjIE30ewQCvDUTk6dalM6Y27doBMmCxc24X7RDGDIet0YeguuaeJiKPVR/79POaHURkVnUrYI6IjK4+f0yf568SkbbW/wbGN1b0wa3eZ7P99upzbwKfcc7tBhwNXN7P930PuKy6NbAHMF9Etqu+fmz1+R7ga83/FYzvbNN9cP1tuncAvxSR3rKO6ef7HgbOEZHNgd85554XkU8DuwOzRQRgdZI/GsY0lRV9aE4D3gB2JtkqWrLyC5xzt4rII0ARuEdEvg0IcINz7qxWhjXGij403cB859wKETkO+Kf9bBEZBbzonLu8+vkngP8F7hCRnzvn3hSR9YG1nXMvtSJ0EMZrACOAzYHOGr/tfeA14JVyqfiPZmUzzSU2P/rAROR959xaKz03GvgtsAj4I3Cyc26t6qm3Sc65HUXkLOAYYBnwOvBV59w7InI0cBbJlsAy4ETn3MxGZA3CeF0gICnziJU+HwFsOMxFLAReBeYBc4EXgOerH+eWS8V/2rIx6WBFz7AgjAPggD6PUYpxlgOPAlOBKcD0cqm4UDGP6cOKniFBGG/NR4u9pW6iAfUAj5MUfyowrVwqvqsbyV9W9BSr7lMfDhxKUuyP6yYalhXAkyRr+9vKpeIs3Th+saKnTBDG7cBnSM6vHw6sqZuoaZ4BrgduKpeKbyhnyT0rekoEYTwG+C5wLLCRcpxWWg7cDVwHTCqXisuU8+SSFV1REMadwBHA8cBBynHS4C3gFuC6cqk4RztMnljRFVT3vU8GfoBfa+96zAJ+Ui4VJ2sHyQMregsFYdwBfAc4F9hUOU5WPAicXS4VZ2gHyTIregsEYVw
guXgmAkbqpsmsu4BzyqXiE9pBssiK3mRBGB8B/BTYQTtLDjhgIvCjcqn4nHaYLLGiN0kQxgcDFwB7amfJoeXADcB55VJxnnaYLLCiN1gQxhsBVwOHaWfxwBLgPODicqnYox0mzazoDRSE8eEkJf+YdhbPzAa+US4Vn9EOklZW9AYIwngd4DLg68pRfPYBybGQUrlUXK4dJm2s6MMUhPEBJPuLI7SzGCBZu3+lXCrO1Q6SJlb0IQrCuIvkYNtpJCPHmPRYCJxULhVv1A6SFlb0IQjCeFfgJuyUWdrdCpxQLhXf0w6izYpepyCMv0pyA0atQzEZXc8B/1ouFV/UDqLJhnuuQxDGZwI3YyXPkjHAQ0EY764dRJOt0WtQvYT1cuBE7SxmyN4HjvT1Jhlbow8iCOPVgd9gJc+6tYA7gzD+pnYQDVb0AQRhvAFwH8k94yb72oEJQRj/WDtIq9mm+yoEYTySZOSTbbSzmKa4huSIvBeXzlrR+xGE8W4kt0VurJ3FNFUMHO3DxBRW9JVUz5FPAdZRjmJa44/AIeVS8QPtIM1k++h9BGE8GpiMldwnBwHXB2Gc66sbrehVQRhvBtyLjeHmo68AF2qHaCbbdAeCMF4fmAZsr53FqDq5XCr+UjtEM3hf9OqQy/cB+2lnMepWkFxU8zvtII1mm+5wLVZykygAtwRhPFY7SKN5XfQgjM8lmRnFmF6rAX8Iwnhb7SCN5O2mexDGRwG3YfeSm/6Vgb3LpeLr2kEawcuiB2G8FcnMnnmdwNA0xn3AZ8ulYuZL4t2mexDGbcCNWMnN4A4GTtIO0QjeFR34IbCPdgiTGRfmYX/dq033IIx3IZm8r0M7i8mUP5Hsr2d2dFlv1ujVwRxvwkpu6rcHycSYmeVN0YHzgR21Q5jMOicI48xOr+XFpnsQxvuT3KXk0x8203h/BXYtl4qLtYPUK/dv/CCM1yaZYCH3v6tpum3I6M0vPrz5LwYC7RAmN04Kwvgg7RD1yvWmexDG2wFPAW3aWUyuPAnsVi4VV2gHqVXe1+jnYyU3jbczGbtHIrdr9CCM9yCZcM+YZpgPjMnKgbk8r9H/UzuAybXNge9rh6hVLtfo1amMp2jnMLn3HhCUS8UF2kEGk9c1+gXaAYwX1iGZNjv1crdGD8L4UOBO7RzGGxWStfq72kEGkqs1enXI3vO1cxivdJOBtXquig4cTXLqw5hWOjUI43W1Qwwkb0U/VTuA8VI3KT+vnpuiVwcH+KR2DuOtb2gHGEhuik7K/6FN7u0ahHFqdxtzUfTqOHCp3nQyXkjtyiYXRQcOATbVDmG897UgjFM5glFeip7av6TGKxsCX9QO0Z/MFz0I4w2BL2jnMKYqlSudzBcd+BrQqR3CmKpDgjBO3W5kHoqeyr+gxlupPDCc6aJXT2ek9pSG8dZx2gFWlumiA4dqBzCmH9sHYbyFdoi+sl70g7UDGLMKB2gH6CuzRQ/CeE1sDjWTXlb0BtkfO9pu0suK3iC22W7SbHSaTrNluej7aQcwZhCpWatnsuhBGK8O7KKdw5hBWNGHaQ9s+mOTflb0YbKj7SYLtgvCeCPtEJDdou+tHcCYGu2vHQCyW/TttQMYU6MdtANABoteHdJ5S+0cxtQo0A4A0F7rC0VkDHAGMKLv9znnPtWEXAPZGOhq8TKNGaoR2gGgjqIDE4ErgWuAnubEqUkq/uGMqVEq3q/1bLovd86Nd87Ncs492vtoWrJVS8U/nDE12iIIY/Vd5HoC3Cki40RkUxFZv/fRtGSrZkU3WdJBCgYurWfTvfdm+jP6POeAUY2LUxMrusmaEcArmgFqLrpzbmQzg9TBim6yJgAe0gxQz1H3DuAE/v8CgCnAVc65ZU3INRAruska9fdsPZvu40n2N35V/frY6nPfbnSoQaj/oxlTJ/X3bD1F39M513cgxgdE5MlGBxpI9a61dVq5TGMaQP1gXD1H3XtEZKveL0RkFK0/n65+msKYIVAfCameNfo
ZwB9F5EVASDZHbEx1Ywanfkt1PUfd7xeR0cA2JEX/i3NuadOSGZMf6kWveVNYRI4EOp1zc0jmOvu1iOzWtGTG5EemNt1/5JybKCL7Ap8DLiE56v4vTUlmWmonefH5w9pmvKadI4+W0vEyFFUz1FP03gNvRWC8c+4OEYkaH8m02ucLjzx2Rcdlo0UYrZ0lp2ZpB6jnKPYrInIVcBRwl4h01fn9JoXGtd0x44qOy3YSYW3tLDmmebcnUN8a/SjgEOAS59y7IrIpH73u3WTMpR3jp3ypbdqB2jk8kP6ii8g6zrn3gNVILnuletfaUmC2iLQ559R/EVO7Ait6JnaeN2P3wvMHamfxxELtALWs0W8lmbX0UZK71aT6fO/na4nINc65s5sT8SNcC5aRa6uxdPH9Xac/tZm8nYpBCz3xtnaAQfexnXOHVj+OdM6Nqn788HNgE+AIEWn6gI3lUnERsKjZy8mr9am8Patr3NzN5O29tLN45u/aAYZ9MM051+Oc2w64uQF5avFyi5aTKyPl1Zdndp28cB1ZvKN2Fg+lf41eBxn8JQ3xUouWkxt7ybN/vr/zjNU7ZXmgncVT2V+j99Gq/Wcreh2OKEyb/T+dPx1REPcx7SweU1+j13N6LS2s6DU6rX3itFPabt9bJJP/n/PkDe0AjXwDfNDAnzUQK3oNftXxi6mfb5uVmkn+PPe8doB6bmoZKyJrVj8/RkR+JiIfjpzhnPtkMwL2w4o+gDZ6lk/qPHualTw1FhJV1O8hqGcffTywSER2Bn5IUrgbm5JqYFb0VViDJf+Y0XXKEzsWyvtpZzEf+qt2AKh/AgcHHAZc5py7DFSuj34VWK6w3FTbiAVvze4a9/ImsmAP7SzmIzJX9IUichZwDBCLSBsKN9SXS8UeYH6rl5tmY2Te32Z0nbJkTVmynXYW80/+oh0A6iv60STXt3/LOfc6sBlwcVNSDS4VfyXTYGzh6acnd4bdHdKzhXYW069UvFdrLrpz7nXn3M+cc9OqX7/snNPYRweYqbTcVPly2wOP3NxxwVYFcRpTY5naPKYdAGq7e226c25fEVnIRy+KEcA55zSGX35YYZmpclb7LQ9+ty0eK0KbdhazSm8QVeZqh4Aaiu6c27f6MU0DE8wEVuDlwBfOTei4ZOqn2x4/UDuJGdQM7QC9MlmUcqlYAf6snaPV2lm+7J7OMx+ykmeGFb0BpmsHaKW1WPTezK6TntqmMH+sdhZTMyt6A9ynHaBVNuXt12d1jXttQ3nPhtfOjsWk5EAcZLvoD5Dsp+fa9lKeO63r1J415INttLOYujxEVGn1TMOrlNmil0vFBSTDW+XWpwqPPTmp8+wN2mXFZtpZTN3u0A7QV2aLXnWvdoBmOa5t8sMTOi7ZtiCsq53FDIkVvYHu1g7QDP/Rft3UqP3GT4rQpZ3FDMnjRJVUDXmW9QEJZgB/A0ZqB2kM527puODBsW3P2C2m2fZ77QAry/QavVwqOuB67RyN0MmypQ90/vtMK3kuWNGb4AYyPt77OrxfeaTrxGdHFV7fWzuLGba5RJU52iFWlvmil0vFl0hOtWXS5vLWq7O6Tvz7evL+LtpZTEP8t3aA/mS+6FXXaQcYip3lheemdn6/sJos20o7i2mI5aT0vZiXov8OqGiHqMfnCrMe/33njzdpE7eJdhbTMHEaxofrTy6KXi4VFwO3aeeo1XfbJs24suMXO4igcYuvaZ6rtQOsSi6KXpXKTaaVXdR+5ZSz2m/dR4RO7SymoeYBk7VDrEpuil4uFR8BntXOsSrCihUTO6MHj2p/8ECRlk1fZVrnWqJKau+9yE3Rq36lHaA/q7F0ydTO02btWXjOpirOp/eBK7RDDCRvRb+alM22uh7vvfNI14nPb1l4q1UTXJjWG09UUZ9fbSC5Knq5VPwAOE87R69AXps3s+ukSrcs2kk7i2maxcCl2iEGk6uiV91ACsbS3kP++uz9naev1iXLc3IdvlmFa4gq6pMoDiZ
3Ra9O8PBjzQyHFWb8aWLneVu02VTFefcBcJF2iFrkruhVv0FpGJ9T2347/RcdV+wiwloayzctNYGo8op2iFrksujVu9rOafVy/6vj8imndfx2X5uP3AvvorzlWI9cFh2gXCpOBh5sxbIKrOi5o/PcaV9om3lgK5ZnUuEnRJW/a4eoVW6LXtX0tXp1quLHdi68aFMV++MZUnrNxqrkuujlUnE68Idm/fwNefetWV3jXtpU3tmzWcswqfR9okqmpu7OddGrTgAWNPqHjpb55Ye7Tl60lizZvtE/26Ta7USVzM0pkPuil0vFV4FxjfyZ+xSefuaezjPX7pCeEY38uSb1FgKnaYcYitwXHaBcKt5Gg25jPbJtyqxbOi4YWRC3QSN+nsmUM4gqL2mHGAovil41Dnh1OD/gzPZfP3hR+9W7i7BGgzKZ7LiXqHKVdoih8qbo1ZldvjnU77+649IpJ7Tfub/NR+6lYb130sCbogOUS8V7gPH1fE87y5fd3RlO/2zbowc2J5XJgOOJKvO1QwyHV0WvOh14vpYXrsnihQ91nTxnu8LL+zY5k0mv64gqE7VDDJd3RS+XiouAY4GegV63Ce+8Mbtr3CsbSWX31iQzKfQoDT5jo8W7osOHw06t8qq5beXlF6d3nbJsDVm6bQtjmXR5EziCqLJEO0gjeFl0gHKpeCH9DCi5f+HJOXd1huu1y4rNFWKZdFgGfImoMk87SKN4W/Sq44EpvV8c03bvzBs6LhxTENbTi2RS4BSiynTtEI0kzmV62rJhC8J4feDhn7Tf8NrX2+7ZT8T7P36+u5qocrx2iEbzvugAE849euS32ifPBDbSzmJU3Q4cSVQZ8EBtFlnRe0Xdu5Fsxq+tnMTouBc4lKjygXaQZrDN1F5R5THgCGCpdhTTcjOAw/NacrCif1RUuR/4AskQvsYPjwNFosoi7SDNZJvu/Ym6DwAmgQ3wmHPPAAcRVd7SDtJstkbvT1SZCnyWjE3FbOryELCfDyUHK/qqRZWHgYNpwug0Rt1dwGeIKt78v7WiDySq/Ak4gJTN52aG5SbgsLzvk6/Mij6YqPIUsBfwsHYUM2w/A47L2sCOjWAH42oVdXcB15Dc+WayZSlwMlHlGu0gWqzo9Yq6Q+ACQLSjmJrMI7lBZbZ2EE1W9KGIur8IXA9280vK3Q98OUszqjSL7aMPRVT5A/AJ+tz5ZlLnQuBzVvKErdGHI+ouAGcAPwU6lNOYxDzgO0SVe7SDpIkVvRGi7t2BW4Ex2lE8NwH4AVHlPe0gaWNFb5Soew2SNfspYNMmt9h8krX4ZO0gaWVFb7SoeyfgCsBmV20+R7IWP52oYpcrD8CK3ixR97HAxcDG2lFyairJZvpj2kGywIreTFF3N8nm/PFAp3KavHgB+CFR5XbtIFliRW+FqHsEcC7wdWz/fagWAOcDv8zzABHNYkVvpah7JPAjkstorfC1eQ34OXAlUWWhdpissqJriLq3IplA4ivAaspp0uoF4CLgRqKKDe81TFZ0TVH3BiSzdH4PGKWcJi1mAZcCvyGqrNAOkxdW9DSIugU4hGSer8/j36XJC4CbgWuJKnO0w+SRFT1tou4tgSOBfwP+hfzeJbcMuBu4EZhkm+fNZUVPs6h7C+BLJKXfh+yX/m1gMhADk30aykmbFT0rou6PkwxY+Sng08DHdQPVxAFPkxR7EjAzj7OgZIEVPaui7q2BfauPvYBt0L8o502Sg2mPVB+ziSrv6kYyYEXPj6i7Hdga2KHPY3tgc2DdBi5pOfASMHelxxNElXIjFiAiG5AMGgGwCdAD9A7LvJdzzi6YqZMV3QfJeHcbkVx33/tYn+Qe+vY+j7bqx8XAu9VHpc/nbwPzWjm4oohEwPvOuUtWel5I3r92Cq4GdnWWD5Ij2vOqj8wSka2B3wPTSc5IHC4iTzrn1q3+9y8DBzvnvi0iGwPjgS2BFcApzrmZStHV+Xa+1mTf9sAE59yuwCsDvO5y4CLn3B7
AUcC1rQiXVrZGN1kz1zlXy4iuBwPbJFv4AKwnIqs757ycQNOKbrLmH30+X8FHry3oe9+AYAfuPmSb7iazqgfiFojIaBEpkMxv3+s+4MTeL0Rkl1bnSxMrusm6M0mutrufZOy4XicCY0Vkjoj8GfiORri0sNNrxnjA1ujGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeOD/AM0W+8UGUHW2AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "is_json_counts = df['is_json'].value_counts().compute()\n", + "is_json_counts.plot(kind='pie')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the value_len increases, the frequency of valid JSON in the 'value' column also increases.\n", + "For the rows that have the value_len one std above the mean, we have the following:\n", + "- isJson True: 46691 rows\n", + " - 99,88% are valid JSON\n", + " \n", + "- isJson False: 54 rows\n", + " - 0,12% are not valid JSON\n", + "\n", + "\n", + "The valid JSON rows alone represent about 9.34% of the above-mean sample (versus 9.35% for all rows one std above the mean), because the number of non-JSON rows is too small to make a meaningful percentage difference. " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "one std above the mean = len: 46745 (9.35%)\n" + ] + } + ], + "source": [ + "print(\"one std above the mean = len: {0} ({1:0.2f}%)\".format(A_COUNT, A_COUNT / COUNT * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_len
meanstdminmaxcount
is_json
False82460.05555613627.1180632813510465354
True271422.740185412552.29861327669449686146691
\n", + "
" + ], + "text/plain": [ + " value_len \n", + " mean std min max count\n", + "is_json \n", + "False 82460.055556 13627.118063 28135 104653 54\n", + "True 271422.740185 412552.298613 27669 4496861 46691" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group = std_above.groupby('is_json')\n", + "group_result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']}).compute()\n", + "group_result" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NOT json count: 54 (0.01%)\n", + "IS json count: 46691 (9.34%)\n" + ] + } + ], + "source": [ + "a = group_result['value_len']['count']\n", + "print(\"NOT json count: {0} ({1:0.2f}%)\".format(a[0], a[0] / COUNT * 100))\n", + "print(\"IS json count: {0} ({1:0.2f}%)\".format(a[1], a[1] / COUNT * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQIAAADuCAYAAADSvgkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEh1JREFUeJzt3XmQnVWdxvHv6bCDvAhhSSAYdmWRyD6Cg0HQQbawODAFOkOGJYIwTlGKoIynQEUccAw4YA2jcWBAKAUxhkFBtlKjISyhhmUgxRAQIQsk8wYyMEnnnvnjvE132l7u7bv87nnf51N1qzu3b/d9oO99+rzLeY8LISAi1dZjHUBE7KkIRERFICIqAhFBRSAiqAhEBBWBiKAiEBFUBCKCikBEUBGICCoCEUFFICKoCEQEFYGIoCIQEVQEIoKKQERQEYgIKgIRQUUgIqgIRAQVgYigIhARVAQigopARFARiAgqAhFBRSAiqAhEBFjPOoB0iM82BCYWt62ADYH1gQ2Ir4MasLb42AssBxYXt2X4vGaQWjrEhRCsM0gr+GwcsCewH7Az8Q2/fXGbCIxv4qevBV6nvxgWA4uAJ4An8PnLTfxs6QIqghT5zAF7AAcUtwOBKcAmRoneoK8U4PHi40KNItKhIkiFz3YDTgCOJr75N7cNNKqVwH3AHOBufL7MOI+MQEXQreJf/YOJb/4TgA/YBmpKDZhPLIU5+HyBcR4ZREXQTXy2HvBxYBpwHLCdbaC2eYVYCjfj87nWYURF0B18Nhk4GzgTmGAbpuOeBb4P3KTNBzsqAitx6H80cAFxFFD1czpWAz8GrsPn86zDVI2KoNN8tgnwGeDvgPcbp+lW84GZwG34fK11mCpQEXRKPKHnfOASmjumXyXPApfh8zusg5SdiqDdfNYDnA5cAbzPOE2qHgUuxef3WQcpKxVBO/nsL4BvAvtaRymJB4FLtA+h9VQE7eCzA4CrgCOso5TUz4Av4vPnrYOUhYqglXy2OfBtYDrgjNOU3TvAV4FrtEOxeSqCVvHZUcTj4ZOso1TMI8B0fP60dZCUqQia5bPNgH8EZlhHqbDVwOXAVfi81zpMilQEzfDZ4cAsYCfrKALEWY9n4vMnrYOkRkUwFj7bGLgSuBDtC+g2a4B/wOfftA6SEhVBo3w2ibjX+kPWUWREPyaODlZZB0mBiqARPjsE+CnlnRVYNk8C0/D5Iusg3a7qE13q57NPAw+hEkjJvsB8fDbVOki304hgNPEU4SuBL1pHkTHrBS7C59daB+lWKoKR+Ow9wK3AsdZRpCVmAefi8zXWQbqNimA4PtuOeM29va2jSEvdDZyCz9+xDtJNVARD8dn2wAPA7tZRpC0eAI7XEYV+KoLBfLYj8YWyi3UUaau5wNH4fKV1kG6gIhgolsDDwGTjJNIZvwM+gc/ftA5iTYcP+/hsAnA/KoEq+TPgbny2qXUQayoCAJ+NB34F7GodRTruI8AcfLaRdRBLKoL41+Be4rqBUk0fBX5gHcJStYsgXlL8JjRvQOCv8NmXrUNYqXYRwGXASdYhpGtcgc8q+Xqo7lEDn00D7kTTiGVdq4DDqrY+YzWLwGd7Ew8dbWYdRbrSH4AD8fkS6yCdUr1NA59tSbyegEpAhjMJuKtYlKYSqlUEPhsH3A7sbB1Fut4hwHXWITqlWkUAlwJHWoeQZJyNz462DtEJ1dlH4LN9gMeA9a2jSFJeBfbG5yusg7RTNUYEPluPOBddJSCNmkgFNhGqUQTx6kL7W4eQZJ2Oz060DtFO5d808NlewOPABtZRJGlLiZsIy6yDtEO5RwTxKMEsVALSvG2AG6xDtEu5iwAuAg60DiGlcXJZT0Eu76aBz3YAFgKVnl4qLbcQ2LNsayyWeUTgUQlI6+0GnGMdotXKOSLw2R7A08A46yhSSkuBXfD5W9ZBWqWsI4KvoRKQ9tkG+IJ1iFYq34jAZ/sD89H0YmmvVcCu+Hy
xdZBWKOOI4EpUAtJ+mwJftQ7RKuUaEcTFLh+wjiGV0Qvshc+ftw7SrLKNCL5hHUAqZT1KsjhueUYEPjsM+LV1DKmcd4AdUz/1uEwjggusA0glbQR81jpEs8oxIoiLli4iDtVEOm0JcVSw2jrIWJVlRDADlYDY2ZbEL4uffhHEGYZ/ax1DKm+GdYBmpF8EcAwwwTqEVN7h+Oz91iHGqgxFcJZ1AJHC2dYBxirtIvDZtsAnrWOIFD5lHWCs0i4COB5NLpLuMQmf7WcdYixSL4JjrQOIDDLNOsBYpFsEPtsILVYi3UdF0GFHAJtYhxAZZB98tpN1iEalXATHWQcQGUZyo4KUi+AY6wAiwzjBOkCj0pxr4LMpwBPWMUSGsRbYFp+/YR2kXqmOCDQakG42DjjUOkQjUi2CpP4nSyUltbBOqkWQ5EkbUikqgrby2QTitE+RbnaAdYBGpFcEGg1IGrZK6XwCFYFI+ySzeaAiEGkfFUEbqQgkFcnsJ0irCHy2FbCjdQyROu1jHaBeaRUBJHspKKmkrYpZsl2vriJwzt1fz30doGsTSmq2tw5QjxEvAe6c24g41Xe8c+699C8uujkwsc3ZhrKdwXOKNGN74AXrEKMZbS2Ac4HPE9/0j9FfBCuBf25jruGoCCQ16Y8IQggzgZnOuQtCCNd1KNNIVASSmvSLoE8I4Trn3IeByQO/J4RwU5tyDUf7CCQ15SkC59zNwC7AAuJca4AAdLoINCKQ1JSnCIgnRuwZ7K9ioiKQ1FjsVG9YvecRPEV3vAm3sQ4g0qBN632gc26tc27BgNvkER472Tn3VCsCQv0jgvHAM865R4Aa8OHi/reJmwrLin8fFEJoz9LQcbFTrXgsqdmwgce+HUKY0rYkI6j3jeWHuX8q8FYI4eqBdzrnHPF6iLUmsg2mFY0kRRs0883FqOBm+kcWnwshzB30mL2AWcVz9QAnhxAWOufOAC4s7p8HnBdCWMsQ6j1q8PAwIacO+HxX4C7gN8DBwDTn3JMhhC2Kr58GHBlCOMs5ty1wA3HeQA24MITw+1FiaDQgXSsEAvG13PexBtQCbk0D5/Fv7JxbUHz+YgjhRGApcFQI4R3n3G7Aj/jTyUwzgJkhhFuccxsA45xzHwBOBQ4NIaxxzl0PnM4wO/jrPWrwZvEfCLFd1gdWAd8e9NA9gTNDCDOccyP97GuBb4UQfl803hxg71FiaETQRsULeZ0X8aAbAWrgihd432NdrXhh1AIu9H/N9X0MAULo/1gb4r5BN+j7vDbg/uJzQuj/vLbuY1yNnlDDER/fw4DH0fe1Gj3Uwrtf59376GFt/+ch4NzagV8LPcXz9bgajrX0uP7Hvnuy3Tp6GZfPqP/XMNSmwfrAd51zU4ib4bsP8X2/A77snNsBuLMYDXwM2B+YHwfobEwslSHVOyJ4z8B/O+emAQcN8dAXQgjz6/iRRwJ7FAEB3uuc2ziE8PYI39PKzQwZxDkc8cU87B+w+NsafODI4EDSkG+5rvVyk9//98ASYF/i7+adwQ8IIdzqnJtHvLr3L51zZxH/L/1bCOGSep5kTLMPQwh3EZccG2zVgM9rrPsrGzgLyxF3LE4pbtuPUgIAa8aSVcRYs6/bDHit2N/2aYYYGTvndgb+O4RwLTAb+CBwP3CKc26b4jFbOufeN9yT1LtpcNKAf/YQt1FG/FMQQqg551YU2zUvACfSf3ThV8D5wD8VP39KCGHB0D/pXSoCSVFvk99/PXCHc+5TwIOs+8e2z6nAGc65NcBi4PIQwnLn3FeAe51zPcT3z/nAS0M9SV0rHTnnZg34Zy+wCLgROI/iqEGxs/AnA7dxnHOnAt8gDo+eATYsdhZuTdxZuDuxjB4MIZw/ahCf9aJ9BZKWp/B511+gJK0lz3y2AtjCOoZIA36Nz//cOsRo6r0wyQ7OuZ8655Y655Y45+4o9lB22msGzynSjCRes/XuLJxF3AkxkTiJ4ufFfZ32R4PnFGnGq9YB6lF
vEWwdQpgVQugtbj8Etm5jruEk8T9VZIBSjQhed86d4ZwbV9zOACyWfFYRSGpKVQTTgb8kHpp4DTgFOLNdoUagIpDUJFEE9Z6/fwXw1yGEFRBPTgCuJhZEJ6kIJDVJFEG9I4IP9pUAQAhhOfCh9kQakYpAUpPEa7beIugpLmcOvDsisJgN+AeD5xQZqzfx+YrRH2av3iK4BpjrnLvCOXc5MBf4VvtiDcPnrwDLO/68ImPzhHWAetVVBMXVik8mzoJaBpwUQri5ncFG8LjR84o06jHrAPWqe3gfQniGOF/A2qPEacwi3S6ZP1qpLYIKsQhEUpDMiEBFINIebwHPWYeoV3pF4POXgNetY4iMYgE+T+aqWukVQZTMkEsqK5n9A5BuEdRzXUQRS0ltwqZaBA9ZBxAZQQDutQ7RiFSL4GHgf6xDiAzjEXy+xDpEI9IsAp/3AvdYxxAZxmzrAI1Kswiin1kHEBnGz60DNCrlIrgHXeJcus8ifP6f1iEalW4R+Hwl2mko3Se50QCkXARRcttiUnpJviZVBCKts5J4RCs5aReBz18mXhtBpBvchs+T3G+VdhFEN1oHECkk+1osQxHcjk4uEnsL8HlSpxUPlH4R+Pxt4BbrGFJ5yY4GoAxFEF1vHUAqbSVgdem+lihHEfj8GeA+6xhSWbPw+ZvWIZpRjiKIvmMdQCqpBlxnHaJZZSqCe0jo0lBSGnPw+QvWIZpVniLweQC+bh1DKiUA3jpEK5SnCKJbgOQmfEiybsfnySxiMpJyFUG8WOQl1jGkEtYAX7EO0SrlKgIAn99Noud7S1L+tQz7BvqUrwiii60DSKmtAi63DtFK5SwCn88D7rSOIaU1E58vtg7RSuUsguhSoNc6hJTOcixWAm+z8haBz58D/sU6hpTOZfg8tw7RauUtguhLwMvWIaQ0HgJusA7RDuUugnj+93TiiR8izVgFTC9OXCudchcBgM/vB75nHUOSdzE+f9E6RLuUvwiiLwCl/SVK2z1Eyae6V6MIfL4KOBNtIkjjSr1J0KcaRQDg84cpwXRR6bhSbxL0qU4RRJcA/2UdQpLxC0q+SdDHhVDqEc+f8tnuwDxgC+so0tWeBw7G55W4MG7VRgTg8+eBU4G11lGka+XA8VUpAahiEQD4/F7gIusY0pVqwGnFmamVUc0iAPD5TOD71jGk61yMz39hHaLTqlsE0XnAb6xDSNe4GZ9fbR3CQrWLwOergZOAl6yjiLl5wNnWIaxUuwgAfL4M+CSwzDqKmHkGOBaf/591ECsqAuhbIOUoYIV1FOm4hcDH8Pnr1kEsqQj6+PxJ4OPEQ0dSDS8CR5TtakNjoSIYKK5m+wm0unIVvABMxeevWAfpBiqCweL1Do8AKj1ULLnngMPxuXYSF1QEQ4mLVnwUqPyQsYSeJpbAH62DdBMVwXB8/jRwGPCsdRRpmfuAj+DzJdZBuo2KYCRxAYtDgDnWUaRpM4Gj8bmODA2herMPx8JnPcDX0HJqKVoNnIfPdTr5CFQEjfDZacAPgI2to0hdlgIn4fPfWgfpdiqCRvlsP+AuYJJ1FBnRAuAEfK7L2ddB+wga5fPHgQOAB62jyLB+CByqEqifRgRj5TNHnL14FbCpcRqJXgPOwefaudsgFUGzfLYT8boGU62jVNy/AxfqqMDYqAhaIY4OPkscHWxmnKZqFgPn4vPZ1kFSpiJoJZ9NJo4OjjBOUhW3Ahfg8+XWQVKnImi1ODr4G+ByYAfbMKX1JPClKl5SrF1UBO3is42A84FLgS2N05TFi8BlwK1lX3mo01QE7eazjLj24ufR0YWxWkY8s/N7xeXlpMVUBJ3is22Jf83OAdY3TpOKt4BrgGuKJe6lTVQEneaznYlrKnwGHWEYzhLiUvbX4/Ol1mGqQEVgJW4yTAc+B+xsnKZbPEGcJfgjbQJ0lorAWpzZeBTxUtrHU73NhreA24Ab8fkj1mGqSkXQTXy2DXGT4RTgIMDZBmqb1cS5Gj8Bbtf
2vz0VQbeKOxePAY4jjhhSP+LwBvAfwGzgl3rzdxcVQQriOQlTiaVwLOlMgV5IfOPPBn6Lz7UCdZdSEaQoTnTaH9hvwMfxppngFeCx4vYo8Jj2+KdDRVAWPptEfynsDUwEJgDbARu06Fn+F3iVON33VeIVgeMbX2/6pKkIyi7OfdiSWAgT6C+HTYH1gHHFR4BeYE1xe5s4s+9V+t78PtcqUCWlIhARXapMRFQEIoKKQERQEYgIKgIRQUUgIqgIRAQVgYigIhARVAQigopARFARiAgqAhFBRSAiqAhEBBWBiKAiEBFUBCKCikBEUBGICCoCEUFFICKoCEQEFYGIoCIQEVQEIoKKQERQEYgIKgIRQUUgIqgIRAT4fxoxDoxWW+6XAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "group_result['value_len']['count'].plot(kind='pie')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### All greater values are JSON\n", + "\n", + "There is absolutely no value longer than 104653 (the max value_len for non-JSON) that is not a valid JSON. \n", + "\n", + "This implies that all the greater values are JSON, but they represent a very low percentage of the whole data (6.76%). " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "104653\n", + "len: 33788 (6.76%)\n" + ] + } + ], + "source": [ + "max_non_json_value = group_result['value_len']['max'][0]\n", + "allJson = df[df['value_len'] > max_non_json_value ]\n", + "length = len(allJson)\n", + "print(\"len: {0} ({1:0.2f}%)\".format(length, length / COUNT * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From c5ec9b967a2c62cd4a94d52b2dbffd5799398b3e Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 14:11:53 -0300 Subject: [PATCH 05/23] Data prep saving other samples --- .../isJson_dataPrep.ipynb | 204 +++++++++++++++++- 1 file changed, 199 insertions(+), 5 deletions(-) diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb index 29b5242..547739c 100644 ---
a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb @@ -30,7 +30,7 @@ "\n", "

Client

\n", "\n", "\n", @@ -46,7 +46,7 @@ "" ], "text/plain": [ - "" + "" ] }, "execution_count": 1, @@ -184,7 +184,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# is JSON analysis\n", + "# is JSON Column\n", "\n", "After manual initial analysis I have think that the huge values are json structured, to validate that I included an new column that is a boolean value with the validation of json" ] @@ -244,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -356,7 +356,7 @@ "4 False " ] }, - "execution_count": 10, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -366,6 +366,200 @@ "df = dd.read_parquet('is_json_above_mean.parquet')\n", "df.head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Saving other possible usefull samples to future analyses" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/fastparquet/util.py:221: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", + " inferred_dtype = infer_dtype(column)\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return 
fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.nanny - WARNING - Worker process 20976 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n" + ] + } + ], + "source": [ + "df[df['is_json'] == True].to_parquet('all_json_above_mean.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[df['is_json'] == False].to_parquet('all_NON_json_above_mean.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From c3fb738d358db68418f068525bfa141bdd9ddcc4 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 14:18:07 -0300 Subject: [PATCH 06/23] Update readme with future questions --- .../README.md | 45 
+++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/README.md b/analyses/2019_03_aliamcami_greatest_values_are_json/README.md index 67e78a1..6c7d919 100644 --- a/analyses/2019_03_aliamcami_greatest_values_are_json/README.md +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/README.md @@ -24,3 +24,48 @@ All the greatest values are JSON, but they represent very little percentage of t ## The top 46745 gratest value_len are valid JSONs, that is 9.35% of the filtered sample (value_len > mean) and 0,41% of the original 10% sample. + + +--- + +# Future questions + +## About JSONs: +- **Are the JSON values always from the same location or related domains?** +- **Is there a set of location domains that always produces JSON?** +- Do the JSON values follow a structural pattern? What pattern? +- What data does the JSON hold? Is there any pattern in the content? +- Do they have nested JSON? Css? Html? Javascript? Recursive study on JSON properties. + +- Is a JSON's structure for a single script_url domain always the same? +- Is every JSON with the same structure produced by the same script_url domain? + +## General +I think some of the questions here may only need a crawler investigation or some wiki reading, since someone may have already described and explained them. I just need to find, read and understand it. + +- Are there other valid data types like html, css... in the values column or just JSON? +- Where does the value come from? What is it used for? + +## Small: value_len < mean +- What are the small values? +- Do the smaller values have any pattern? +- What is the majority data type? + +## Medium: mean < value_len < (mean + std) +- How many rows are there in the intersection of *“no JSON”* and *“everything is JSON”* ? +- What are they? Are they from a specific script_url domain? Or related domains? + +## Big: value_len > (mean + std) +- What are the big non-JSON values?
+ +## Security and data sharing: +- Do the value columns have any javascript? nested javascript? +- Do the javascripts in the dataset contain known malicious behaviors? +- Can they collect data that threatens user's privacy? + +## Statistical knowledge / coincidence: +The **mean** of the original 10% sample is pretty similar to the **std** of the sample taken after filtering for values above the mean +- why? +- Is it a coincidence? +- Is it always like this? +- Is it a statistical pattern? \ No newline at end of file From 1a5bcdb740460048a9f149b0466958088d1e895e Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 14:57:17 -0300 Subject: [PATCH 07/23] Add of 'domain' column to data prep --- .../isJson_dataPrep.ipynb | 993 ++++++++++++++++-- 1 file changed, 934 insertions(+), 59 deletions(-) diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb index 547739c..5175e39 100644 --- a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ "\n", "

Client

\n", "\n", "\n", @@ -46,10 +46,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -75,17 +75,27 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'], dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#Original sample\n", "df = dd.read_parquet('sample_0.parquet', \n", " engine='pyarrow', \n", - " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url'])\n", + " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'])\n", "\n", "# df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", - "df_index={'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str}\n", "df.columns" ] }, @@ -103,39 +113,28 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "499805" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "#Save\n", - "dff = df[df['value_len'] > 1356]\n", - "dd.to_parquet(df=dff, path='filtered_above_mean.parquet', engine='pyarrow')\n", + "df = df[df['value_len'] > 1356]\n", + "dd.to_parquet(df=df, path='filtered_above_mean.parquet', engine='pyarrow')\n", "# len(dff)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url'], dtype='object')" + "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'], dtype='object')" ] }, - "execution_count": 2, + "execution_count": 7, "metadata": {}, 
"output_type": "execute_result" } @@ -180,6 +179,196 @@ "print(df_mean, df_min, df_max, df_std)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Domains\n", + "The following code is from this same project: ~/analyses/hello_world.ipynb\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import tldextract\n", + "\n", + "def extract_domain(url):\n", + " \"\"\"Use tldextract to return the base domain from a url\"\"\"\n", + " try:\n", + " extracted = tldextract.extract(url)\n", + " return '{}.{}'.format(extracted.domain, extracted.suffix)\n", + " except Exception as e:\n", + " return 'ERROR'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str, 'location': str})\n", + "df['location_domain'] = df.location.apply(extract_domain)\n", + "df['script_domain'] = df.script_url.apply(extract_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#save\n", + "df.to_parquet('0_sample_domains.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000valuevalue_lensymbolscript_urllocationlocation_domainscript_domain
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...https://www.canada.ca/en/services.htmlcanada.caadobedtm.com
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.com
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.com
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.net
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.net
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "\n", + " value value_len \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "\n", + " symbol script_url \\\n", + "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", + "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "\n", + " location location_domain \\\n", + "0 https://www.canada.ca/en/services.html canada.ca \n", + "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", + "2 https://maniform.world.tmall.com/category-1282... 
tmall.com \n", + "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "\n", + " script_domain \n", + "0 adobedtm.com \n", + "1 alicdn.com \n", + "2 alicdn.com \n", + "3 coches.net \n", + "4 coches.net " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read\n", + "df = dd.read_parquet('0_sample_domains.parquet', engine='pyarrow')\n", + "df.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -191,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -207,11 +396,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", + "\n", "df['is_json'] = df['value'].apply(is_json)" ] }, @@ -225,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -244,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -273,6 +462,9 @@ " value_len\n", " symbol\n", " script_url\n", + " location\n", + " location_domain\n", + " script_domain\n", " is_json\n", " \n", " \n", @@ -284,6 +476,9 @@ " 3713\n", " window.sessionStorage\n", " https://assets.adobedtm.com/caacec67651710193d...\n", + " https://www.canada.ca/en/services.html\n", + " canada.ca\n", + " adobedtm.com\n", " True\n", " \n", " \n", @@ -293,6 +488,9 @@ " 103878\n", " window.localStorage\n", " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", + " https://maniform.world.tmall.com/category-1282...\n", + " tmall.com\n", + " alicdn.com\n", " True\n", " \n", " \n", @@ -302,6 +500,9 @@ " 103878\n", " window.localStorage\n", " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", + " 
https://maniform.world.tmall.com/category-1282...\n", + " tmall.com\n", + " alicdn.com\n", " True\n", " \n", " \n", @@ -311,6 +512,9 @@ " 1358\n", " window.document.cookie\n", " https://www.coches.net/scripts/common.min.js?2...\n", + " https://www.coches.net/fiat/segunda-mano/\n", + " coches.net\n", + " coches.net\n", " False\n", " \n", " \n", @@ -320,6 +524,9 @@ " 1358\n", " window.document.cookie\n", " https://www.coches.net/scripts/common.min.js?2...\n", + " https://www.coches.net/fiat/segunda-mano/\n", + " coches.net\n", + " coches.net\n", " False\n", " \n", " \n", @@ -348,15 +555,22 @@ "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", "\n", - " is_json \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 False \n", - "4 False " + " location location_domain \\\n", + "0 https://www.canada.ca/en/services.html canada.ca \n", + "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", + "2 https://maniform.world.tmall.com/category-1282... tmall.com \n", + "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "\n", + " script_domain is_json \n", + "0 adobedtm.com True \n", + "1 alicdn.com True \n", + "2 alicdn.com True \n", + "3 coches.net False \n", + "4 coches.net False " ] }, - "execution_count": 2, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -376,17 +590,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/fastparquet/util.py:221: FutureWarning: A future version of pandas will default to `skipna=True`. 
To silence this warning, pass `skipna=True|False` explicitly.\n", - " inferred_dtype = infer_dtype(column)\n", "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", + "tornado.application - ERROR - Exception in callback >\n", "Traceback (most recent call last):\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", " ret = self._cache[fun]\n", @@ -433,8 +645,8 @@ " self.gen.throw(type, value, traceback)\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n", - "tornado.application - ERROR - Exception in callback >\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21460)\n", + "tornado.application - ERROR - Exception in callback >\n", "Traceback (most recent call last):\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", " ret = self._cache[fun]\n", @@ -481,8 +693,11 @@ " self.gen.throw(type, value, traceback)\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n", - "tornado.application - ERROR - Exception in callback >\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21460)\n", + "distributed.nanny - WARNING - Worker process 21460 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", "Traceback (most recent call last):\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", " ret = self._cache[fun]\n", @@ -529,25 +744,685 @@ " self.gen.throw(type, value, traceback)\n", " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=20976)\n" + "psutil.AccessDenied: psutil.AccessDenied (pid=21468)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "distributed.nanny - WARNING - Worker process 20976 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n" - ] - } - ], - "source": [ - "df[df['is_json'] == True].to_parquet('all_json_above_mean.parquet')" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = 
cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21468)\n", + 
"distributed.nanny - WARNING - Worker process 21468 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception 
occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in 
wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", + "distributed.nanny - WARNING - Worker process 21484 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call 
last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in 
memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No 
such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n", + "distributed.nanny - WARNING - Worker process 21476 was killed by unknown signal\n", + 
"distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, 
value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n", + "distributed.nanny - WARNING - Worker process 21497 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return 
fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21489)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=21489)\n", + "distributed.nanny - WARNING - Worker process 21489 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n" + ] + } + ], + "source": [ + "df[df['is_json'] == True].to_parquet('all_json_above_mean.parquet', engine='pyarrow')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ From efc051ecd1ec58daf6191954aad35786bda6163b Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 17:36:32 -0300 Subject: [PATCH 08/23] Update jsJson_dataPrep to include an extra column with the md5 of value columns --- .../isJson_dataPrep.ipynb | 767 +++++++++++++++++- 1 file changed, 762 insertions(+), 5 deletions(-) diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb index 5175e39..f1cb759 100644 --- a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ "\n", "

Client

\n", "\n", "\n", @@ -46,10 +46,10 @@ "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -431,6 +431,763 @@ "df.to_parquet('is_json_above_mean.parquet')" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000valuevalue_lensymbolscript_urllocationlocation_domainscript_domainis_json
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...https://www.canada.ca/en/services.htmlcanada.caadobedtm.comTrue
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.comTrue
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.comTrue
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.netFalse
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.netFalse
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "\n", + " value value_len \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "\n", + " symbol script_url \\\n", + "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", + "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "\n", + " location location_domain \\\n", + "0 https://www.canada.ca/en/services.html canada.ca \n", + "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", + "2 https://maniform.world.tmall.com/category-1282... 
tmall.com \n", + "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "\n", + " script_domain is_json \n", + "0 adobedtm.com True \n", + "1 alicdn.com True \n", + "2 alicdn.com True \n", + "3 coches.net False \n", + "4 coches.net False " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read\n", + "df = dd.read_parquet('is_json_above_mean.parquet')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Value md5\n", + "Add a new column called \"value_md5\" that holds the MD5 hash of the `value` column." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import hashlib\n", + "\n", + "def md5(value):\n", + " return hashlib.md5(value.encode('utf-8')).hexdigest()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df['value_md5'] = df['value'].apply(md5)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000valuevalue_lensymbolscript_urllocationlocation_domainscript_domainis_jsonvalue_md5
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...https://www.canada.ca/en/services.htmlcanada.caadobedtm.comTruecff77029e3ae45dd439a62987b1d8340
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.comTrue9ac0a0a0afb677c8fd985a7c2f4ddbc5
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://maniform.world.tmall.com/category-1282...tmall.comalicdn.comTrue9ac0a0a0afb677c8fd985a7c2f4ddbc5
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.netFalsedb64465b639e01993d9212390f057628
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...https://www.coches.net/fiat/segunda-mano/coches.netcoches.netFalsedb64465b639e01993d9212390f057628
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "\n", + " value value_len \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "\n", + " symbol script_url \\\n", + "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", + "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "\n", + " location location_domain \\\n", + "0 https://www.canada.ca/en/services.html canada.ca \n", + "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", + "2 https://maniform.world.tmall.com/category-1282... 
tmall.com \n", + "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + "\n", + " script_domain is_json value_md5 \n", + "0 adobedtm.com True cff77029e3ae45dd439a62987b1d8340 \n", + "1 alicdn.com True 9ac0a0a0afb677c8fd985a7c2f4ddbc5 \n", + "2 alicdn.com True 9ac0a0a0afb677c8fd985a7c2f4ddbc5 \n", + "3 coches.net False db64465b639e01993d9212390f057628 \n", + "4 coches.net False db64465b639e01993d9212390f057628 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/fastparquet/util.py:221: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", + " inferred_dtype = infer_dtype(column)\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return 
fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1373)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1373)\n", + "distributed.nanny - WARNING - Worker process 1373 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 
344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1375)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret 
= self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in 
_get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1375)\n", + "distributed.nanny - WARNING - Worker process 1375 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + 
"AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No 
such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", + "tornado.application - ERROR - Exception in callback >\n", + "Traceback (most recent 
call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: 'Process' object has no attribute '_cache'\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", + " ret = self._cache[fun]\n", + "AttributeError: _cache\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", + " yield\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + "ProcessLookupError: [Errno 3] No such process\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", + " return self.callback()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", + " memory = proc.memory_info().rss\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", + " return self._proc.memory_info()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", + " rawtuple = self._get_pidtaskinfo()\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", + " return fun(self, *args, **kwargs)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", + " return fun(self)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", + " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", + " self.gen.throw(type, value, traceback)\n", + " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", + " raise AccessDenied(proc.pid, proc._name)\n", + "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", + "distributed.nanny - WARNING - Worker process 1421 was killed by unknown signal\n", + "distributed.nanny - WARNING - Restarting worker\n" + ] + } + ], + "source": [ + "#save\n", + "df.to_parquet('is_json_above_mean_md5.parquet')" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -577,7 +1334,7 @@ ], "source": [ "#read\n", - "df = dd.read_parquet('is_json_above_mean.parquet')\n", + "df = dd.read_parquet('is_json_above_mean_md5.parquet')\n", "df.head()" ] }, From 2b617decabb37103ca60012742af1ae2511b821e Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 19:42:13 -0300 Subject: [PATCH 09/23] Rename 'isJson_Sample_Comparasion' to 'isJson_Quantitative_Comparasion' --- ...le_Comparasion.ipynb => isJson_Quantitative_Comparasion.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename analyses/2019_03_aliamcami_greatest_values_are_json/{isJson_Sample_Comparasion.ipynb => isJson_Quantitative_Comparasion.ipynb} (100%) diff --git 
a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb b/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantitative_Comparasion.ipynb similarity index 100% rename from analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Sample_Comparasion.ipynb rename to analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantitative_Comparasion.ipynb From 68700ecf597d3ae859295a68dd33f67f68288232 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 19:44:46 -0300 Subject: [PATCH 10/23] Rename folder from ''2019_03_aliamcami_greatest_values_are_json' to '2019_03_aliamcami_value_analyses' --- .../README.md | 0 .../isJson_Quantitative_Comparasion.ipynb | 0 .../isJson_Quantity_Analysis.ipynb | 0 .../isJson_dataPrep.ipynb | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename analyses/{2019_03_aliamcami_greatest_values_are_json => 2019_03_aliamcami_value_analyses}/README.md (100%) rename analyses/{2019_03_aliamcami_greatest_values_are_json => 2019_03_aliamcami_value_analyses}/isJson_Quantitative_Comparasion.ipynb (100%) rename analyses/{2019_03_aliamcami_greatest_values_are_json => 2019_03_aliamcami_value_analyses}/isJson_Quantity_Analysis.ipynb (100%) rename analyses/{2019_03_aliamcami_greatest_values_are_json => 2019_03_aliamcami_value_analyses}/isJson_dataPrep.ipynb (100%) diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/README.md b/analyses/2019_03_aliamcami_value_analyses/README.md similarity index 100% rename from analyses/2019_03_aliamcami_greatest_values_are_json/README.md rename to analyses/2019_03_aliamcami_value_analyses/README.md diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantitative_Comparasion.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb similarity index 100% rename from analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantitative_Comparasion.ipynb rename to 
analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb similarity index 100% rename from analyses/2019_03_aliamcami_greatest_values_are_json/isJson_Quantity_Analysis.ipynb rename to analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb diff --git a/analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb similarity index 100% rename from analyses/2019_03_aliamcami_greatest_values_are_json/isJson_dataPrep.ipynb rename to analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb From 0820ceadb29e0f957c9c7c5034e184187c5fdfc1 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 19:45:55 -0300 Subject: [PATCH 11/23] Removal of outdated notebook --- .../isJson_Quantity_Analysis.ipynb | 509 ------------------ 1 file changed, 509 deletions(-) delete mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb deleted file mode 100644 index 0a209bd..0000000 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_Quantity_Analysis.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Start client" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", - " data = yaml.load(f.read()) or {}\n", - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", - " defaults = yaml.load(f)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 4
  • \n", - "
  • Cores: 4
  • \n", - "
  • Memory: 8.59 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dask.dataframe as dd\n", - "from dask.distributed import Client\n", - "\n", - "#Initializing client\n", - "client = Client()\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data\n", - "Using filtered and evaluated for json data named 'is_json_above_mean.parquet'. You can get this by running the 'isJson_dataPrep.ipynb'\n", - "\n", - "This new sample has 499805, meaning that its only 4,42% of the original sample (most values are smaller than the sample's mean of 1356)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_1000valuevalue_lensymbolscript_urlis_json
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...3713window.sessionStoragehttps://assets.adobedtm.com/caacec67651710193d...True
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...103878window.localStoragehttps://g.alicdn.com/alilog/mlog/aplus_v2.jsTrue
3usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
4usunico=17/12/2017:0-00155123:830; SessionASM=...usunico=17/12/2017:0-00155123:830; SessionASM=...1358window.document.cookiehttps://www.coches.net/scripts/common.min.js?2...False
\n", - "
" - ], - "text/plain": [ - " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "\n", - " value value_len \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "\n", - " symbol script_url \\\n", - "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", - "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... 
\n", - "\n", - " is_json \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = dd.read_parquet('is_json_above_mean.parquet')\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DF overview\n", - "Some overview about the sample after the data prep: \n", - "- Rows: 499805\n", - "- Mean: 27829.33,\n", - "- Min: 1357,\n", - "- Max: 4496861\n", - "- Std: 122092.41" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "27829.332847810645 1357 4496861 122092.41371885882\n" - ] - } - ], - "source": [ - "df_mean = df['value_len'].mean()\n", - "df_min = df['value_len'].min()\n", - "df_max = df['value_len'].max()\n", - "df_std = df['value_len'].std()\n", - "(df_mean, df_min, df_max, df_std) = dd.compute(df_mean, df_min, df_max, df_std);\n", - "print(df_mean, df_min, df_max, df_std)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quantity analysis " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Whole\n", - "This whole sample has: \n", - "- False: 307577 rows\n", - " - 61,54% are not valid JSON\n", - " \n", - " \n", - "- True: 192228 rows\n", - " - 38,46% are valid JSON" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False 307577\n", - "True 192228\n", - "Name: is_json, dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['is_json'].value_counts().compute()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ORIGINAL SAMPLE: One std above the mean\n", - "Original Sample Data: \n", - "- Mean: 1356.97\n", - "- Std: 26310.62\n", - "\n", - "I'll be 
using the original sample's mean and std to make the following analyses\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "ROW_COUNT = 499805\n", - "MEAN = 1356\n", - "STD = 26310" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "- 46745 rows have the value_len greater than 27666\n", - "- This represent 9.35% of the rows on this sample\n", - "\n", - "As the value_len increases the percentage of valid JSON on the columns 'value' also increases, for this filtered sample the following data was verified: \n", - "- True: 46691 rows\n", - " - 99,88% are valid JSON\n", - " \n", - "- False: 54 rows\n", - " - 0,11% are not valid JSON\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len: 46745 (9.35%)\n" - ] - } - ], - "source": [ - "dfa = df[df['value_len'] > (MEAN + STD)]\n", - "length = len(dfa)\n", - "print(\"len: {0} ({1:0.2f}%)\".format(length, length / ROW_COUNT * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True 46691\n", - "False 54\n", - "Name: is_json, dtype: int64" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfa['is_json'].value_counts().compute()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## All greater values are JSON\n", - "\n", - "There is absolute no value greater than 104653 that represents a valid JSON. \n", - "\n", - "This implies that all the greater values are JSON but they represent very low percentage of the whole data. \n", - "\n", - "The top 46745 gratest value_len are valid JSONs, that is 9.35% of this sample (value_len > mean) and 0,41% of the original sample with all the smaller values. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "group = dfa.groupby('is_json')\n", - "group_result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']}).compute()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_len
meanstdminmaxcount
is_json
False82460.05555613627.1180632813510465354
True271422.740185412552.29861327669449686146691
\n", - "
" - ], - "text/plain": [ - " value_len \n", - " mean std min max count\n", - "is_json \n", - "False 82460.055556 13627.118063 28135 104653 54\n", - "True 271422.740185 412552.298613 27669 4496861 46691" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "group_result" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len: 46745 (9.35%)\n" - ] - } - ], - "source": [ - "allJson = df[df['value_len'] > 104653]\n", - "length = len(dfa)\n", - "print(\"len: {0} ({1:0.2f}%)\".format(length, length / ROW_COUNT * 100))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 327429a3f6ae02bfb32ad84fbbd5d854aba50c6e Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 19:46:53 -0300 Subject: [PATCH 12/23] Add analyse for the correlation the domain and the value have with each other --- .../isJson_correlation_domain_and_value.ipynb | 922 ++++++++++++++++++ 1 file changed, 922 insertions(+) create mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb new file mode 100644 index 0000000..f171d76 --- /dev/null +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb @@ -0,0 +1,922 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start 
Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n", + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " defaults = yaml.load(f)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 4
  • \n", + "
  • Memory: 8.59 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.distributed import Client\n", + "\n", + "#Initializing client\n", + "client = Client()\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Objective\n", + "\n", + "The objective of this notebook is to answer two main questions: \n", + " - \"Are the JSON values always from the same location or related domains?\" \n", + " - \"Is there a set of location domains that always produces JSON?\"\n", + "\n", + "To answer this we will use the sample data sets produced by the notebook \"isJson_dataPrep.ipynb\": 'all_json_above_mean.parquet' for the first question and 'is_json_above_mean_md5.parquet' for the second; this contains two extra calculated columns that will be important: 'is_json' and 'location_domain'.\n", + "\n", + "\n", + "OBS.: For \"value\" comparison I will use value_md5 instead, because it's reliable and faster. value_md5 is the calculated md5 of the value column \n", + "OBS2.: To see the validation that all the biggest values are JSON, please refer to 'isJson_Quantitative_Comparasion.ipynb'\n", + "\n", + "### Findings: \n", + "\n", + "In this notebook I was able to validate a couple of facts about the two proposed questions, which are: \n", + "- One domain produces multiple JSONs\n", + "- One JSON is usually (99.9%) produced by a single domain. \n", + "\n", + "\n", + "- One domain can produce both JSON and non-JSON values, but most produce only one type\n", + "- Most of the domains that produce a single type produce the JSON type. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Is there a set of location domains that always produces JSON?\n", + "The dataset used for this analysis contains non-JSON values as well, for the sake of proving that one domain may or may not produce only JSON values."
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['value_md5', 'is_json', 'location_domain'], dtype='object')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet('is_json_above_mean_md5.parquet', engine='pyarrow', columns=['value_md5', 'is_json', 'location_domain'])\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_md5is_jsonlocation_domain
0cff77029e3ae45dd439a62987b1d8340Truecanada.ca
19ac0a0a0afb677c8fd985a7c2f4ddbc5Truetmall.com
29ac0a0a0afb677c8fd985a7c2f4ddbc5Truetmall.com
3db64465b639e01993d9212390f057628Falsecoches.net
4db64465b639e01993d9212390f057628Falsecoches.net
\n", + "
" + ], + "text/plain": [ + " value_md5 is_json location_domain\n", + "0 cff77029e3ae45dd439a62987b1d8340 True canada.ca\n", + "1 9ac0a0a0afb677c8fd985a7c2f4ddbc5 True tmall.com\n", + "2 9ac0a0a0afb677c8fd985a7c2f4ddbc5 True tmall.com\n", + "3 db64465b639e01993d9212390f057628 False coches.net\n", + "4 db64465b639e01993d9212390f057628 False coches.net" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "location_domain_group = df.compute().groupby('location_domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [], + "source": [ + "agg = location_domain_group.agg({'value_md5': ['nunique', 'count'], 'is_json': ['sum', 'nunique']})" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_md5is_json
nuniquecountsumnunique
location_domain
0123movies.com222.01
10010.com288.01
1001freefonts.com20155155.01
10fastfingers.com42828.01
10jqka.com.cn73030.01
\n", + "
" + ], + "text/plain": [ + " value_md5 is_json \n", + " nunique count sum nunique\n", + "location_domain \n", + "0123movies.com 2 2 2.0 1\n", + "10010.com 2 8 8.0 1\n", + "1001freefonts.com 20 155 155.0 1\n", + "10fastfingers.com 4 28 28.0 1\n", + "10jqka.com.cn 7 30 30.0 1" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1563" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Count the number of domains that only produce one type of value (json or non-json)\n", + "f1 = agg['is_json']['nunique'] == 1\n", + "agg_1 = agg[f1]\n", + "oneType = len(agg_1['is_json'])\n", + "oneType" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1226" + ] + }, + "execution_count": 180, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Out of the ones there have only one type of output, these are the ones that have as JSON\n", + "f2 = agg['is_json']['sum'] > 0\n", + "agg_1a = agg[f1 & f2]\n", + "oneType_json = len(agg_1a['is_json'])\n", + "oneType_json" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "series = pd.Series([(oneType - oneType_json), (oneType_json)], index=['non-json', 'json'], name='One type')\n", + "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "294" + ] + }, + "execution_count": 179, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Count the number of domains that only produce one BOTH json and non-json values\n", + "agg_2 = agg[agg['is_json']['nunique'] == 2]\n", + "twoTypes = len(agg_2['is_json'])\n", + "twoTypes" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWYAAAFbCAYAAADmwiRlAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XecVNXBxvHf2cLSERBRNHqNhVgQKxZsmDcxyaQYo1hi7xiNxHqNbX3VOKZZo1Gj0dcSTdTEmJvEqNgLMRABex2NSBAQhs7s7pz3jzvgosDO7t6Zc++d5/v5zGeX2dmdB3GfPXvm3HOMtRYREYmPOtcBRERkZSpmEZGYUTGLiMSMillEJGZUzCIiMaNiFhGJGRWziEjMqJhFRGJGxSwiEjMqZhGRmFExi4jEjIpZRCRmVMwiIjGjYhYRiRkVs4hIzKiYRURiRsUsIhIzKmYRkZhRMYuIxIyKWUQkZlTMIiIxo2IWEYkZFbOISMyomEVEYkbFLCISMypmEZGYUTGLiMSMillEJGZUzCIiMaNiFhGJGRWziEjMqJhFRGJGxSwiEjMqZhGRmFExi4jEjIpZRCRmVMwiIjGjYhYRiRkVs4hIzKiYRURiRsUsIhIzDa4DiHSH5wf1wFrAQKBXuw/Zzzy0/Z+LwHzg41w201LZhCKdZ6z97P+/Iu55ftAD2BzYDFivdFu39HYosDZhGfcHTDeeKg/MAj4uvf3s+zOAN3LZzPRuPIdIp6iYxalSAQ8HtgS2avd2U+L1G9084FXglfa3XDYzw2kqSSUVs1SN5wdNwK7AnsBIwgLehHgVcGfNJSzpV4HJwNO5bOZVt5Ek6VTMUjGl0fDOwBhgb8JS7ukyU5XMBp4BngaeAF7KZTNFp4kkUVTMEhnPDxqBUYQlPIawiHu7zBQTc4AJwCPAI7lsJuc2jsSdilm6xfODnsA3gINLb/u4TZQIbwH3AffkspmprsNI/KiYpdNKI+N9gYOA7wD93CZKtFeBe4Hf5bKZt1yHkXhQMUtZSuuF9
yEcGX+XcKmaROvfwO+Ae3PZzAeuw4g7KmZZI88PdgSOAb4HrOM4Tq2wwPPAPcAduWxmnuM8UmUqZvmc0rK2g4AfEL6YJ+4sBG4Drs5lM287ziJVomKWFTw/WJ+wjI8DhjiOIysrAn8FrsxlMxNch5HKUjELnh9sA5xJOH/c6DiOdGwqcBVwdy6bWeY6jERPxVzDPD/4H+As4Kuus0iXzAR+DVyfy2Y+dh1GoqNirkGeH+wGXAHs7jqLRGIpcA1wuV4oTAcVcw3x/GAL4HLCtceSPp8APwGu0xRHsqmYa0DpRb2LgaOAerdppAreBy4E7tQeHcmkYk4xzw8GAD5wGitvIi+1YSpwTi6b+bvrINI5KuYUKq1DPgX4MTDIcRxxbwJwdi6bmeQ6iJRHxZwynh/sBfyGcKN5keUscDtwRi6b+cR1GFkzFXNKeH7QH/gpcALdO2pJ0m0W8KNcNnOX6yCyeirmFPD84BvAjcAGrrNIYvwdGKe9oeNJxZxgnh8MJrwC7DDXWSSRFgHnEF6goiKIERVzQnl+cCBwHdrxTbpvAnBMLpt533UQCamYE8bzg3WB6wn3RBaJygLCFwZvdh1EVMyJ4vnBPoR79GrnN6mUPwJH5bKZ+a6D1DIVcwJ4fmAI5wIvRVfuSeW9CXwvl8287DpIrVIxx1zp6r3b0f4WUl2LgBNy2czdroPUIhVzjHl+MAJ4AF0sIu5cSzj33OI6SC1RMceU5weHE+6129t1Fql5zwFjc9nMdNdBaoWKOWY8P+hBuDZ5nOssIu3MBA7OZTNPuA5SC+pcB5BPlZbCPYVKWeJnKPCo5wdnuA5SCzRijgnPDzYHHgY8x1FEOnIVcLquFqwcFXMMeH6wC/AXYLDrLCJluoPwasFW10HSSFMZjnl+8C3gMVTKkiyHA3/0/EAHMFSAitkhzw+OJLzSSisvJIm+CTxcWmsvEVIxO+L5wanAb9GVfJJsewBPen4w1HWQNFExO+D5wXmEx81rQ3tJg5HAs54fbOw6SFroxb8q8/wgS7jvhUjazAD2zWUz01wHSToVcxV5fnAx4bHyImk1G9g9l8284TpIkqmYq8Tzgx8CV7vOIVIFHwCjc9nMh66DJJWKuQo8P/g+4bpPzSlLrXgN2COXzcxxHSSJVMwV5vlBBvgT0OA6i0iV/RP4ci6bWeg6SNJoVUYFeX6wO/AHVMpSm0YBD5Q25pJOUDFXiOcHIwkvs9aVUVLLvgLc4fmBuqYT9B+rAjw/2AT4O6ArokRgLPAr1yGSRMUcsdLWnY8A67rOIhIjJ3l+8L+uQySFXvyLkOcHjcDjwGjXWURi6vBcNnOn6xBxpxFztH6GSllkTW7y/GB71yHiTiPmiHh+cDDwO9c5RBLgA2DHXDYzy3WQuFIxR8Dzg62AiUAf11lEEuIJ4CvaaH/VNJXRTZ4f9AceQKUs0hl7A5e7DhFXKubuuw3Y3HUIkQQ6w/ODb7sOEUcq5m7w/OAc4Luuc4gklAFu8/zAcx0kbjTH3EWeH+wD/AOdQCLSXf8k3PCo4DpIXGjE3AWeH6xNuAJDpSzSfaPQfPNKVMxdcw2wjusQIiky3vODXV2HiAtNZXRS6cWKB13nEEmh14DtctnMMtdBXIv1iNkYE6t9XD0/WAu4wXUOkZTaAh29BsS8mGPoF8Aw1yFEUuxszw+2cx3CtdgXszFmPWPMU8aYl4wxLxtj9ijdf4gxZlrpvivaPX6hMeYyY8wUY8wLxpihUeTw/OArwDFRfC0RWa0G4FbPD2r6cInYFzNwKPCwtXZbYCTwkjFmGHAFsA+wLbCTMWa/0uP7AC9Ya0cCTwHHdzeA5wd9gZu7+3VEpCzbAme7DuFSEor5ReBoY0wzMMJauwDYCXjCWjvLWtsK3AXsWXp8gfDkEIBJgBdBhiuAjSL4OiJSngs9P9jCdQhXYl/M1tqnCEt3OnCHMeYI1nzadIv9dKlJG908b8/zgz2Bcd35GiLSaU3ALbV6JFXs/
9LGmI2Aj621NwO3ANsT7uS2lzFmbWNMPXAI8GTUz13a+P4m1vyDQEQqY1fgZNchXIhtMRtjGoBlhLtQvWSM+TfwPeBqa+0M4FzC00KmAJOttZVYWzwOGF6Brysi5bmotINjTYntBSbGmJHAzdbaUS6ev7Rm+W1gsIvnF5EVLstlM+e7DlFNsRwxG2NOItyLwuU/xgWolEXi4EelQ45rRiyL2Vr7a2vtltbaf7h4fs8Pvgic4uK5ReRzegPNrkNUUyyLOQYuA3q4DiEiKxzr+UHNvN6jYv4Mzw9GAge5ziEiK2kAfuI6RLWomD/vMrQ8TiSO9vf8YBfXIapBxdyO5wejgYzrHCKyWld0/JDkUzGv7DLXAURkjfb0/OCbrkNUmoq5xPODnYG9XOcQkQ6lfk2zivlTP3IdQETKsnPa55pVzIDnB18gvNxbRJJhvOsAlaRiDp1CN3ehE5Gq+l5pQJVKNV/Mnh/0AU5wnUNEOqWBFF+dW/PFDBwFrOU6hIh02vGeH/R2HaISarqYPT8wwGmuc4hIlwwEjnQdohJqupgJLybZzHUIEemy00oDrFSp9WLWEjmRZBsOfN11iKjVbDF7frA14SnbIpJsqZuOrNliBg53HUBEIvGVtC2dq8liLs1JaWtPkXQwwPddh4hSTRYz4em7G7kOISKROcx1gCjVajEf7DqAiERqK88PtnUdIio1V8yeH9QDY13nEJHIpWbUXIv7Q+wNDHUdorPmv/gnFk75BxhoHOKx9jfGYxrCYwk/eeTXLJz2KBuefl/42EkPsfClv1Hffwjr7H8+pr6RpR++wuI3nmPQl493+dcQqaQDgTNdh4hCzY2YSeA0RuuC2cyf9BDrHnklw469HopFFr32FADLZrxFcdmilR6/cMrDrHfMdfQYuglL3puMtZb8s/cwYPQhLuKLVMuGnh+Mch0iCjVVzJ4fNAL7u87RJcU2bGsBW2zDti6jvu8gbLGNuU/cylp7H/35x7e1YVuWYeoaWPTKBHptsiP1PftWP7dIdR3gOkAUaqqYgX2BQa5DdFZDv7XpP+q7TL/haD687nBMU296bbw9Cyb/hd6b7kxD35X/Sv1H7c+MO86guDhP0/pbsOjlx+i3nY4ylJqQin3Va22OOZFrl9uWLmTxWxNZ/6RbqGvqw6wHsyx8+TEWv/4sQw+9/HOP77v1PvTdOryocd4zd9Nvh2+z5N1JLHr5Mer7D2HgPsdiTK39TJYa8UXPD7bPZTOTXQfpjpr57ixdVPI11zm6YmnuJRoGDKW+9wBMfQO9N9+Vec/cTcu8j5h+4/F8eMMx2JZlTL9x5Rf2WhfMofDft+i92S7kn7+Htb9zTvhCYG6Ko7+JSFV8w3WA7upwxGyMOdBa+4eO7kuArYG1XYfoiob+Qyh89AbFlqWYhiaWvj+F/jvtR/8dvrXiMR/88gDWP/HmlT5v3tN3stYe4Qoi21IAY8AYbOuyquYXqbIxwKWuQ3RHOSPmc8u8L+7GuA7QVU3DhtN7+Ghm3DaeGbf+AKyl38g1D/4LM98BoMfQTQDou81XmXHLKRRmvkOvjXeoeGYRh3b1/KCH6xDdYay1q/6AMV8n/JVgLHBvuw/1B7a01iZqWYrnB38CvuM6h4hUxV65bOYp1yG6ak0j5o+AfwFLgUntbn8mXN2QGJ4f1AF7uc4hIlWT2N+QYQ1zzNbaKcAUY8xd1trWKmaqhG3RuX4itWQMcLHrEF1VzhzzW8aYdz97q3iyaCX6p6eIdNounh/0dB2iq8pZx7xju/d7El6PnrSLNFTMIrWliXB738ddB+mKDkfM1to57W7TrbVXkaAjmTw/aAD2dJ1DRKousQOyctYxb9/uj3WEI+h+FUsUvR1IVl4RiUZ6ixn4Rbv3W4EcydrPeLTrACLixM6eHzTlspnEXVHVYTFbaxP7U6ckNacaiEinNALDgamug3RWh3PMxpjBxphrjDGTjTGTjDFXG2MGVyNcREa6DiAizmzpO
kBXlLNc7h5gFuF2egeU3r93jZ8RE6X9l7/kOoeIOLOV6wBdUc4c8yBr7SXt/nypMWa/SgWK2BZAoq+ZF5FuSWQxlzNiftwYc7Axpq50GwsElQ4WkW1cBxARp1I7lXEicDewrHS7BzjdGLPAGDO/kuEisIXrACLi1KaeHzS5DtFZ5azKSPIa4OGuA4iIU/UkcGVGOasyHivnvpjSC38ikrjpjNWOmI0xPYHewNrGmIGAKX2oPzCsCtm6pbTV56auc4iIc4l7AXBNUxknAuMJS7j9wYbzgV9VMlREPMKNTESktqWnmK21VwNXG2NOtdZeW8VMUfmC6wAiEgsbuw7QWeWsY84bY4747J3W2v+rQJ4oreM6gIjEwhDXATqrnGLeqd37PYEvE05tqJhFJAnSV8zW2lPb/9kYMwC4o2KJopO4fwwRqYgenh8MyGUzeddBylXOBSaftRjYLOogFaARs4gsl6iBWjkb5T8E2NIf6wmvpvt9JUNFJFH/ECJSUUOAt12HKFc5c8w/b/d+K/C+tfbDCuWJkkbMIrJcogZq5Zz59yTwOuHxTAOBQqVDRUTFLCLLJaoPyrkkeyzwT8LTsccCE40xB1Q6WAQS9RNSRCoqUX1QzlTGecBO1tqPAYwxQ4BHgfsqGaw7PD+oBwa5ziEisZGoYi5nVUbd8lIumVPm57k0mE/39hARSVQxlzNi/rsx5mHgd6U/HwT8tXKRItHoOoCIxEpv1wE6o5wLTM4yxuwP7E44Cr3JWvvHiicTEYlOvesAnVHOiBlr7QPAAxXOIiJSKWV1XVzEfa64qzS/LCLtJWrEnNZiFhFpL1Ej5kSFFemsk+sffHZcw5/Xcp1D3FpEz5mQcR2jbOXslTEaaAY2Kj3eANZa+8XKRusWTWUIALe2fW378Q33z+xhWj3XWcSdfiz5yHWGzihnKuMW4JeEqzJ2AnZk5T2aRWJrKU29TmoZn7d2xUZcUpuKrgN0RjnFnLfW/s1a+7G1ds7yW8WTdY9GzLLChOL2I/9tN33GdQ5xqs11gM4op5gfN8b8zBizqzFm++W3iicTidDhhXO3bbV1ifp1ViKVlM3XgPJe/Nu59HbHdvdZYJ/o40Sm1XUAiZdF9Op3VsuJr1/Z44ZhrrOIE4k5vQTKu/JvTDWCRCzuUy3iwB+Le+w0rvjn5zavm76b6yxSdfNcB+gMY+2qXxMxxhxmrb3TGHP6qj5urf1lRZN1k+cH8wn3kBZZYSDzP5nUNK5YZ+zarrNIVV1Mc77ZdYhyrWmOuU/pbb/V3OJutusAEj9z6T/o0tbD3nSdQ6ouUSPm1U5lWGtvLL29uHpxIjUL2Nh1CImfW9u+vtsxDX+buIGZvXPHj5aUSEcxL2eM6QkcC2wF9Fx+v7X2mArmisIs1wEkvg5cdtFGzzWdmjeGAa6zSFUkqpjLWS53B7AusC/wJLABsKCSoSKiYpbVmsHgdX/V9p2prnNI1aSumDe11l4ALLLW3k54wfmIysaKhOaYZY1+3nrQHnNsv3+7ziFVkaiVWuUUc0vp7TxjzNbAAMCrWKLoaMQsHTqgcNFga1nsOodU3PuuA3RGOcV8kzFmIHAB8GfgVeCnFU0VDRWzdOg9O2zDu9q+/KLrHFJRc2nOz3cdojM6LGZr7W+stXOttU9aa79orV3HWvvraoTrJhWzlOWC1qP3WGB7veI6h1RMznWAzipnVcZawBGE0xcrHm+t/WHlYkViuusAkgyWurqDC+c3/aXHeQVj6OE6j0Qu5zpAZ5UzlfFXwlKeBkxqd4s7XUQgZXvFbrxpUNzledc5pCJyrgN01movyV7xAGMmW2sTuZuc5wcfAuu7ziHJ0EBry7Sm43K9TGEz11kkUqfRnL/GdYjOKGsdszHmeGPMesaYQctvFU8WDY2apWytNDQeVTi7xdpk7d0rHcq5DtBZ5RRzAfgZ8DyfTmP8q5KhIvSG6wCSLBPtlls+XRyhTfXT5R3XATqrnGI+nfAiE89au
3HpFufz/tp73XUASZ4TWk4f1WLrE7XuVVZrKQkcoJVTzK9AYhfgv+w6gCTPUpp6ndxy2lydE5gKr9CcT9zBGeWcYNIGvGSMeRxYtvzOBCyXA5jiOoAk0yPFHbedYjd5elvzzh6us0i3JPKS+3JGzH8CLgOeI1nL5chlM7OB/7rOIcl0WOHckW22bobrHNItL7kO0BXlHC11uzGmB7B56a43rLUta/qcmJlKuDueSKcspHf/c1qPf/3njTeu5zqLdFkii7nDEbMxZm/gLeBXwPXAm8aYPSucK0qazpAuu69tr1FvF4c95zqHdEmRhH7/lzOV8Qvgq9bavay1exLuy3xlZWNFSt9U0i1jCxcML1qTqG0jBYB3aM4vdB2iK8op5kZr7YrlJtbaN4HGykWK3JOEPzlFuuQTBgzOth6SuCVXkpjrLT6nnGL+lzHmFmPM3qXbzSTkxT+AXDYzl3CeWaTLbmr75m4f2UH/dJ1DOuVJ1wG6qpxiHke4lvmHwGmE+zGfVMlQFfCE6wCSfAcuu+gL1pKofX1r3OOuA3RVOfsxLyM89+9Ea+13rbVXlu5LksT+A0l8TGfIeje0fTuRLybVoI9ozid2r5zVFrMJNRtjZhNe2vyGMWaWMebC6sWLzFNonlki8NPWg3b/xPZL5BKsGpPowdiaRszjgdHATtbawdbaQcDOwGhjzI+qki4iuWxmHgldzyhxY8yBhQsHWssS10k+65gHl7DOzxaw9fUrL0S4dmKB4dctZKvrF3L2I0sBePaDVra5YSE73byQtz8Jxyzzllr2vXMRHW0FnBCpLeYjgEOste8tv8Na+y5wWOljSfOE6wCSDu/Y9Te6p21M7F4IPGrbRv5+WO+V7nv8vVYefKOFqSf14ZWT+3LmbuEBLb94vsD9Y3vxk316csOLBQAueXIZP969CWNM1bNXQGqLudFaO/uzd1prZ5Gs5XLLPeE6gKTHea3H7r7Q9nzVdY729tyogUG9Vi7VG/5VwN+9iaaG8P51+oTf8o31sKQVFrdYGuvhnU+KTF9QZC+vnO1zYu8DmvPvug7RHWsq5kIXPxZXT4E2QJdoFKmrP7RwXqO1xHp7gjfnFHn6/VZ2/s1C9rptES9OD78Fzt29iRMeWspVEwucMqoH501YyiVjmhynjcw/XAforjUV80hjzPxV3BYAI6oVMCq5bCYPTHSdQ9Jjqt1ks78VRz3rOseatBZh7lJ44dg+/OwrPRl732KstWy7bj0vHNeHx4/sw7tziwzrV4cFDrpvMYc9sISZCxP9WvkfXQfortUWs7W23lrbfxW3ftbaJE5lAPzedQBJl9NaTtltqW1823WO1dmgv2H/LRowxjBq/XrqDMxe/OmLe9ZaLn1qGRfs2cTFTy7j4r2bOGybRq6ZmMRfigGYDzzqOkR3lXOBSZr8Hi2bkwi10NDjmJazllkbz/+v9vtSIxPeC/eJf3NOG4U2WLv3p/PQt09pIbNZAwN7GRa3QJ0Jb4tjPUGzRn+hOZ/YnyrL1VQx57KZGcDTrnNIujxX3Hqr54tbOv//6pD7F7PrLYt4Y06RDX65gFsmFzhmu0benWvZ+vqFHHzfEm7fr9eKVReLWyy3T2nh5J3ClRqn79KD7/1+Cec+tpRxOyX1l2IecB0gCiYlaxbL5vnBScANrnNIuvRi2eKpTcfNbjRtG7rOUsOWAGvTnE/qUXgr1NSIueR+IHFngEm8LaGp9yktp2prULceTkMpQw0Wcy6bmQVMcJ1D0ufh4qjtphU3fsZ1jhp2v+sAUam5Yi6513UASadDCz8e0WaNzpmsvoWE55OmQq0W8x9J5kUyEnML6DPgvNZj33edowbdm9TTSlalJou5tHl+4q8Okni6p22fnd8trvu86xw15jeuA0SpJou55C7XASS9xhYu3LRozSeuc9SIl2nOv+A6RJRquZjvBz5yHULSaTZrDflp60Gvuc5RI25xHSBqNVvMuWymBbjedQ5Jr1+3fXv0f
+3AF13nSLnlJyylSs0Wc8mNwFLXISS9Dig0r28tC1znSLE/0ZxP3frxmi7mXDYzG801SwV9aIcMu7kt82/XOVLsRtcBKqGmi7nkKtcBJN1+0nroHnNtXx3iGr1JNOcTfVLJ6tR8MeeymZeBx1znkDQzZmzhwgHWatosYle4DlApNV/MJVe7DiDp9pbdwPtD2146qCE6b5OiS7A/S8Uc+gvhP7RIxfitx+++yPbUErpo/JzmfCz3wI6CihnIZTMWuMZ1Dkm3InX13y/8uC7u5wQmwEzgdtchKknF/KnfAp87FVwkSi/ZTYf/o7jDc65zJNw1NOdTPV+vYi7JZTMLSfGLCRIfp7b8cJdltvEd1zkSagE1cGGYinllv0KXaUuFFWhsOrblzMVxPScw5q6iOT/PdYhKUzG3k8tmlgCXus4h6fdMccSIiXYLbarfOXOAn7sOUQ0q5s/7DfCe6xCSfscUztqhxdZ/6DpHglxOc36+6xDVoGL+jNLmRue7ziHpt5iefca3/GCm6xwJ8R/CqcaaoGJetd8B2hVMKi4o7rLDK8WNNKXRsR+nfSVGeyrmVSitaz7DdQ6pDYcUzhvRZo1Gzqs3iS5sNmaM8YwxL3fi8eONMb3b/dnZUVUq5tXIZTNPE54NKFJR8+k74ILWo/W6xuqdQXPeVuF5xgO9O3xUFaiY1+wc0FVaUnl3t/3PLu8X10nV8UgRuZfm/JPd+PwGY8ztxpipxpj7jDG9jTFfNsb82xgzzRhzqzGmyRjzQ2AY8LgxZsWOdcaYy4wxU4wxLxhjhnb7b1MmFfMa5LKZt4DLXeeQ2nBg4aJNipa5rnPEyDzCUWx3DAdustZuA8wHTgduAw6y1o4AGoBx1tprCK9hGGOtHVP63D7AC9bakcBTwPHdzFI2FXPHLgNecR1C0u9jBg75ZeuBr7rOESPn0pz/bze/xn+stc+W3r8T+DLwnrX2zdJ9twN7ruZzC4QbnEE4z+11M0vZVMwdyGUzBeBY0FVaUnnXtX139Md2rX+5zhEDzxHN6STdmZtusdYu//w2wtF1VaiYy5DLZiaiPZulSg4oXLSutThbERADLcCJEb3gt6ExZtfS+4cAjwKeMWbT0n2HA8vnsBcA/SJ4zm5TMZfvfEAbz0jFfWCHbnBL29cnu87h0M9pzpe9zK0DrwFHGmOmAoOAK4GjgT8YY6YR/ib869JjbwL+1v7FP1fMpyN16YjnB2OACa5zSC2w9qWmE6atZRZt4zpJlb0DjKA5v8R1EJc0Yu6EXDbzOHCz6xxSC4w5qHBBP2tZ5jpJFbUBR9Z6KYOKuSvOAqa7DiHp94bdcOMHins87zpHFV1Kc/7Zjh+WfirmTsplM3lgnOscUhvObjlh98W26XXXOargOeAS1yHiQsXcBbls5iHgWtc5JP3aqG84rHAu1tLqOksF5YHv05xvcx0kLlTMXXcGUEu/Zoojk+3mX5pQ3C7Nv+KPozmfcx0iTrQqoxs8P1gfmAys4zqLpFsThaVTm46b0WRaN3adJWJ30Jw/wnWIuNGIuRty2cx04GDCV5NFKmYZPXoe33LGAmu7dSVb3LwF/MB1iDhSMXdTaQmdTjyRinuqOHKbf9nhT7vOEZH5wLdpzi9wHSSOVMzRuAJ40HUISb+jCmdv32rrkn5OYBE4lOZ8Law26RIVcwRKJ54cCbztOouk2yJ69f1Ry8nd3XHNtfNozgeuQ8SZijkipfXN+wOLXWeRdHuouNuOrxW/kNRVGvfQnM+6DhF3KuYI5bKZacAxdG+rQZEOHVI4f8s2a2a5ztFJkwm/P6QDKuaI5bKZe4HTXOeQdJtHv4EXtx6RpKmzmcB+2gejPCrmCshlM9cCl7rOIen2f2377vpBMs4JzANfozn/H9dBkkIXmFSQ5wc3ACe5ziHpNZRPPn6h6ZQmYxjgOstqLAH2pTmflmV+VaERc2X9ALjPdQhJr5kMWufqtv2nuc6xGq3AWJVy52nEXGGeH/QA/
kp4CKRIRbzYNG7SEJPfwXWOdixwBM35O10HSSKNmCusdJjrfoAO2JSKOaBw0VBrWeQ6Rzt+f6fyAAAGJklEQVTjVcpdp2Kuglw2sxD4BvCG6yySTu/bdTe4ve2rk1znKPlfmvPXuA6RZJrKqCLPDzYkPDNwE9dZJH0MxeJLTSe8MsAsHuEwxgU057UiqZs0Yq6iXDbzAbAH8KrrLJI+lrq6gwsX9HZ4TuDpKuVoqJirLJfNzAD2IrwKSiRSr9mNNnmwuFu11zZb4CSa81dW+XlTS1MZjnh+MAAIgNGus0i6NNDaMrXpuHd7m8LwKjxdG3A0zfk7qvBcNUMjZkdKmx59lbCcRSLTSkPjkQW/aG3FD3BoAQ5SKUdPxexQLptZTLiU7reus0i6vGi/tMUTxZHPVPAp5gPfojl/fwWfo2ZpKiMmPD+4DPix6xySHk0Ulk5rOu6/PUyrF/GXzhGW8ssRf10p0Yg5JnLZzHmEl3Cn+Zh6qaJl9Oh5Usv4fMTnBD4HjFIpV5aKOUZy2cz1hJduz3SdRdJhQnH7kZPtZlHtVXEXsA/N+aTtA504msqIIc8PhgF/AHZznUWSrw9LFkxpOn5BgykO6+KXsMBFNOcviTKXrJ5GzDGUy2Y+AvYGrnUcRVJgEb36ndVy4vQufvpCwpUXKuUq0og55jw/+D5wE9DbdRZJtod7nP3s8LoPO7Nu/iXCbTvfqlQmWTUVcwJ4fjACeADY1HUWSa6BzP9kUtO4tjpjh5Tx8OsJL7F2dXl3TdNURgKUDnndEfiz6yySXHPpP+jS1sM6OicwDxxAc/4HKmV3VMwJUbpScD/gdMLjekQ67da2r+/6oV174mo+/E9gO1004p6mMhLI84NNgd8QboYk0inrMee/zzWd2qvdOYFtwM+AC2nOtziMJiUaMSdQLpt5GxhDeEHKQsdxJGFmMHjda9v2W35O4CvArjTnz1Upx4dGzAnn+cFGwM3AV1xnkURpeaTHmRduVvfRlZpLjh8Vc0p4fnAs8AuI7TH2Eh8vACeUXlSWGNJURkrksplbgC2Bh1xnkdiaD5wCjFYpx5tGzCnk+cH+wOXA5q6zSCxY4F7gzFw209UrAKWKVMwp5flBA3A8cBEw1HEccedR4JxcNqOjzBJExZxynh/0Bc4EzgD6Oo4j1TMJ8HPZzKOug0jnqZhrhOcHQwlHz8cDDY7jSOW8A5wP3JvLZvTNnVAq5hrj+cHmhPPP+7vOIpGaCVwC3JTLZrQeOeFUzDXK84NdCI+y+iZgHMeRrptNuD3sL3PZjC42SgkVc43z/GALwjnow4AejuNI+aYBVwN35bKZpa7DSLRUzAKA5wfrAT8knIMe7DiOrFoRCICrctnMBNdhpHJUzLISzw96AocApwLbOY4joQXAb4FrctnMO67DSOWpmGW1PD8YDZxMuN2oTlCpvneA64Bbc9nMfNdhpHpUzNKh0lrobwMHA/uiuehK+gj4PeGVehO15K02qZilUzw/GAh8j7Ckx6D9VqLwMXAfYRk/k8tmio7ziGMqZukyzw/WBQ4knJPeBS2764w5hOc43gs8kctm2hznkRhRMUskSvtC7wvsA+yN9udYldcJ9674K/CoLgSR1VExS0V4frAlYUmPISzqQU4DufEh8ARhGT+qnd2kXCpmqTjPD+qAbfi0qEcDA52Gip4FXgWeAZ4mnCt+320kSSoVszhRuqBla2Crz9z6u8xVBgvkCEu4/e21XDazwGEuSREVs8SK5wdfICzo5aW9EbAeMIzqlfZSYBbhPhT/4fMFvLhKOaRGqZglMTw/6E1Y0kMJLxsfTDh3PYhwamT5dqam3VuzivtaCVdFLC/f9m9n5bKZRRX9i4h0QMUsIhIzujhARCRmVMwiIjGjYhYRiRkVs4hIzKiYRURiRsUsIhIzKmYRkZhRMYuIxIyKWUQkZlTMIiIxo2IWEYkZFbOISMyomEVEYkbFLCISMypmEZGYUTGLiMSMillEJGZUz
CIiMaNiFhGJGRWziEjMqJhFRGJGxSwiEjMqZhGRmFExi4jEjIpZRCRmVMwiIjGjYhYRiRkVs4hIzKiYRURiRsUsIhIzKmYRkZhRMYuIxIyKWUQkZlTMIiIxo2IWEYkZFbOISMyomEVEYkbFLCISMypmEZGYUTGLiMSMillEJGZUzCIiMfP/yJL5lUvL2ZgAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "series = pd.Series([(oneType), (twoTypes)], index=['Json', 'both'], name='Domain output')\n", + "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "Most of the domains has only one type of value output, but not all of them. 16% have outputs that can be json and non-json\n", + "\n", + "Out of the ones that has one value type, 78% is json. \n", + "\n", + ">Are there a set of location domains that always produces a JSON?\n", + "\n", + "Yes, there is a set that always produces the value as a valid JSON, but not all of them. There are also the ones there never produces JSON and some that produces both. \n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_domain\n", + "twitter.com 5594\n", + "petsmart.com 2313\n", + "cdiscount.com 1835\n", + "debenhams.com 1229\n", + "mediamarkt.de 1094\n", + "Name: value_md5, dtype: int64" + ] + }, + "execution_count": 189, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Using the above methoed I could tell that the domains did not have only one \n", + "#output, but I could not find a way to tell the output type. \n", + "#Thats why I decided to calculate by hand as you propably notice, sorry. 
\n", + "\n", + "location_domain_group_unique_md5 = location_domain_group['value_md5'].nunique()\n", + "location_domain_group_unique_md5.sort_values(ascending=False).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## The JSON values are always from the same location or related domains?\n", + "For \"value\" comparison I will use value_md5 instead, because its reliable and faster\n", + "\n", + "* value_md5 is the calculated md5 for the value columns" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['value_md5', 'location_domain', 'value_len'], dtype='object')" + ] + }, + "execution_count": 191, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet('all_json_above_mean.parquet', columns=['value_md5','location_domain', 'value_len'])\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_md5location_domainvalue_len
0cff77029e3ae45dd439a62987b1d8340canada.ca3713
19ac0a0a0afb677c8fd985a7c2f4ddbc5tmall.com103878
29ac0a0a0afb677c8fd985a7c2f4ddbc5tmall.com103878
3983f2d6827a86b128a02cf7442c94af1coches.net1686
4b2ad4d7452aeed3df181b1501cc20231coches.net1686
\n", + "
" + ], + "text/plain": [ + " value_md5 location_domain value_len\n", + "0 cff77029e3ae45dd439a62987b1d8340 canada.ca 3713\n", + "1 9ac0a0a0afb677c8fd985a7c2f4ddbc5 tmall.com 103878\n", + "2 9ac0a0a0afb677c8fd985a7c2f4ddbc5 tmall.com 103878\n", + "3 983f2d6827a86b128a02cf7442c94af1 coches.net 1686\n", + "4 b2ad4d7452aeed3df181b1501cc20231 coches.net 1686" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/worker.py:2791: UserWarning: Large object of size 1.89 MB detected in task graph: \n", + " (\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
location_domain
nunique
value_md5
000599fa6f59053c67e6ccbef137a0d21
0005e12de9897336bf5c7e352a8075681
00076462ead16ac77a1d56745584fd5b1
0007a2345e42bca1d5cac86e356bb87b1
000b0b6b104a36cbc6f31b923e1b31a71
\n", + "" + ], + "text/plain": [ + " location_domain\n", + " nunique\n", + "value_md5 \n", + "000599fa6f59053c67e6ccbef137a0d2 1\n", + "0005e12de9897336bf5c7e352a807568 1\n", + "00076462ead16ac77a1d56745584fd5b 1\n", + "0007a2345e42bca1d5cac86e356bb87b 1\n", + "000b0b6b104a36cbc6f31b923e1b31a7 1" + ] + }, + "execution_count": 207, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aggmd.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [], + "source": [ + "f1 = aggmd['location_domain']['nunique'] > 1\n", + "aggf = aggmd[f1]" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "35746" + ] + }, + "execution_count": 215, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_values_count = len(aggmd)\n", + "unique_values_count" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(35, 35711)" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "values_multiple_origin = len(aggf)\n", + "values_single_origin = unique_values_count - values_multiple_origin \n", + "(values_multiple_origin, values_single_origin )" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 219, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "series = pd.Series([(values_multiple_origin), (values_single_origin)], index=['multiple', 'single'], name='Value Origin')\n", + "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "The absolute most values have only one origin. Only 35 occurencies of the same value are found to have more than one domain origin. \n", + "\n", + ">The JSON values are always from the same location or related domains?\n", + "\n", + "Almost, 0,097% of the values have multiple origins, but 99,9% is only produced by one domain. \n", + "\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4e18a11938d628fee979e401903cad1150620192 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Sun, 31 Mar 2019 19:55:58 -0300 Subject: [PATCH 13/23] Readme update - Quantitative_Comparasion overview --- .../README.md | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/README.md b/analyses/2019_03_aliamcami_value_analyses/README.md index 6c7d919..d7195a9 100644 --- a/analyses/2019_03_aliamcami_value_analyses/README.md +++ b/analyses/2019_03_aliamcami_value_analyses/README.md @@ -1,6 +1,7 @@ # Overview -All the greatest values are JSON, but they represent very little percentage of the whole data. 
+## JSON +All the greatest values are JSON, but they represent a very small percentage of the whole data. ### Most of the data have small value_len (mean = 1356 for the 10% sample) @@ -25,14 +26,25 @@ All the greatest values are JSON, but they represent very little percentage of t ## The top 46745 gratest value_len are valid JSONs, that is 9.35% of the filtered sample (value_len > mean) and 0,41% of the original 10% sample. +--- +## Correlation of location_domain and value + +- One domain produces multiple JSONs +- One JSON is usually (99.9%) produced by a single domain. + +### + +- One domain can produce values that are either JSON or not, but most produce only one type +- Most of the domains that produce a single type produce the JSON type. + --- # Future questions ## About JSONs: -- **The JSON values are always from the same location or related domains?** -- **Are there a set of location domains that always produces a JSON?** +- **The JSON values are always from the same location or related domains?*** +- **Are there a set of location domains that always produces a JSON?*** - Does the JSON values follow a structure pattern? What pattern? - What data does the JSON hold? Is there any pattern on content? - Do they have nested JSON? Css? Html? Javascript? Recursive study on JSON properties. @@ -40,6 +52,8 @@ The top 46745 gratest value_len are valid JSONs, that is 9.35% of the filtered s - Is a JSON's structure for a single script_url domain always the same? - Is every JSON with the same structure produced by the same script_url domain? + *See notebook 'isJson_Quantitative_Comparasion.ipynb' for more information + ## General I'm think some things here maybe a crawler investigation or just wiki reading, since someone may have already described and explained. I just need to find, read and understand it. 
From a509bffb4471205335e70c44e7969279b27f247d Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Wed, 3 Apr 2019 23:14:14 -0300 Subject: [PATCH 14/23] DataPrep cleanup and new 'json_keys' and 'json_schema' columns to dataPrep final sample output --- .../isJson_dataPrep.ipynb | 2229 +++++------------ 1 file changed, 574 insertions(+), 1655 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb index f1cb759..d14915f 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Start client" + "# Start" ] }, { @@ -21,46 +21,66 @@ "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", " defaults = yaml.load(f)\n" ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 4
  • \n", - "
  • Cores: 4
  • \n", - "
  • Memory: 8.59 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "import dask.dataframe as dd\n", "from dask.distributed import Client\n", + "from dask.diagnostics import ProgressBar" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All sub samples and new samples with new columns/data will be saved under the \"DIR\" directory to keep things organized. \n", + "As such, the function \"save_parquet\" and \"read_parquet\" adds this directory to every parquet name, and I'm using this functions instead of dd.read_parquet/dd.to_parquet direct to ensure the same read and write settings across the notebook. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#Initializing client / distributed\n", + "# client = Client()\n", + "# client\n", "\n", - "#Initializing client\n", - "client = Client()\n", - "client" + "#Create folder to save/read new data\n", + "DIR = 'sample_0_prep/'\n", + "import os\n", + "if not os.path.exists(DIR):\n", + " os.makedirs(DIR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If no \"recalculate_partition\" is passed on, it will not recalculate the partitions. It is not mandatory, but good if you are significantly reducing the size of the data. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#Save a DF to a parquet\n", + "def save_parquet(df, name, recalculate_partition=False):\n", + " with ProgressBar():\n", + " #DF.REPARTITION copyed from: https://stackoverflow.com/questions/44657631/strategy-for-partitioning-dask-dataframes-efficiently\n", + " if recalculate_partition:\n", + " n = 1+df.memory_usage(deep=True).sum().compute() // (1000 * 1000 * 100)\n", + " print(\"Npartition: \", n)\n", + " df.repartition(npartitions= n).to_parquet(DIR + name, engine=\"pyarrow\")\n", + " else:\n", + " df.to_parquet(DIR + name, engine=\"pyarrow\")\n", + " \n", + " \n", + "def read_parquet(name):\n", + " return dd.read_parquet(DIR + name, engine='pyarrow')" ] }, { @@ -75,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -84,7 +104,7 @@ "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'], dtype='object')" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -103,7 +123,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Filtered value_len > 1356\n", + "## DF overview\n", + "Some overview about the sample: \n", + "- Mean: 1356.97,\n", + "- Min: 0,\n", + "- Max: 4496861\n", + "- Std: 26310.62" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 48.2s\n", + "1356.9776628910975 0 4496861 26310.62140481331 11292867\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " df_mean = df['value_len'].mean()\n", + " df_min = df['value_len'].min()\n", + " df_max = df['value_len'].max()\n", + " df_std = df['value_len'].std()\n", + " df_len = df['value_len'].count()\n", + " (df_mean, df_min, df_max, df_std, df_len) = dd.compute(df_mean, 
df_min, df_max, df_std, df_len);\n", + " print(df_mean, df_min, df_max, df_std, df_len)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FILTER: value_len > df_mean\n", "1356 is the value_len mean\n", "\n", "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", @@ -113,14 +170,22 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 58.0s\n", + "Npartition: 244\n", + "[########################################] | 100% Completed | 1min 30.9s\n" + ] + } + ], "source": [ "#Save\n", - "df = df[df['value_len'] > 1356]\n", - "dd.to_parquet(df=df, path='filtered_above_mean.parquet', engine='pyarrow')\n", - "# len(dff)" + "save_parquet(df= df[df['value_len'] > df_mean], name='above_mean.parquet', recalculate_partition=True)" ] }, { @@ -141,7 +206,7 @@ ], "source": [ "#Read\n", - "df = dd.read_parquet('filtered_above_mean.parquet', engine='pyarrow')\n", + "df = read_parquet('above_mean.parquet')\n", "df.columns" ] }, @@ -149,42 +214,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## DF overview\n", - "Some overview about the sample: \n", - "- Mean: 1356.97,\n", - "- Min: 0,\n", - "- Max: 4496861\n", - "- Std: 26310.62" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1356.9776628910975 0 4496861 26310.62140481331\n" - ] - } - ], - "source": [ - "df_mean = df['value_len'].mean()\n", - "df_min = df['value_len'].min()\n", - "df_max = df['value_len'].max()\n", - "df_std = df['value_len'].std()\n", - "(df_mean, df_min, df_max, df_std) = dd.compute(df_mean, df_min, df_max, df_std);\n", - "print(df_mean, df_min, df_max, df_std)" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "# Domains\n", - "The following code is from this same project: ~/analyses/hello_world.ipynb\n" + "# Add Column: Domains\n", + "The following code is copyed from this same project: ~/analyses/hello_world.ipynb\n", + "\n", + "It uses the data saved from the last section\n", + "This section is dedicated to extract the domain of the columns \"location\" and \"script_url\" and add it as new columns \"location_domain\" and \"script_domain\"" ] }, { @@ -206,28 +240,39 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ + "#To guarantee the usage of the correct parquet created above in case we start from this section\n", + "df = read_parquet('above_mean.parquet')\n", + "\n", "df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str, 'location': str})\n", - "df['location_domain'] = df.location.apply(extract_domain)\n", - "df['script_domain'] = df.script_url.apply(extract_domain)" + "df['location_domain'] = df.location.apply(extract_domain, meta='O')\n", + "df['script_domain'] = df.script_url.apply(extract_domain, meta='O')" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 1min 17.3s\n" + ] + } + ], "source": [ "#save\n", - "df.to_parquet('0_sample_domains.parquet', engine='pyarrow')" + "save_parquet(df=df, name='above_mean_domain.parquet')" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -251,189 +296,140 @@ " \n", " \n", " \n", - " value_1000\n", - " value\n", - " value_len\n", - " symbol\n", - " script_url\n", - " location\n", " location_domain\n", + " location\n", " script_domain\n", + " script_url\n", " \n", " \n", " \n", " \n", " 0\n", - " 
{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " 3713\n", - " window.sessionStorage\n", - " https://assets.adobedtm.com/caacec67651710193d...\n", - " https://www.canada.ca/en/services.html\n", " canada.ca\n", + " https://www.canada.ca/en/services.html\n", " adobedtm.com\n", + " https://assets.adobedtm.com/caacec67651710193d...\n", " \n", " \n", " 1\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - " window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", " tmall.com\n", + " https://maniform.world.tmall.com/category-1282...\n", " alicdn.com\n", + " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", " \n", " \n", " 2\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - " window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", " tmall.com\n", + " https://maniform.world.tmall.com/category-1282...\n", " alicdn.com\n", + " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", " \n", " \n", " 3\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", + " https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", + " https://www.coches.net/scripts/common.min.js?2...\n", " \n", " \n", " 4\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " 
https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", + " https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", + " https://www.coches.net/scripts/common.min.js?2...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "\n", - " value value_len \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "\n", - " symbol script_url \\\n", - "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", - "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + " location_domain location \\\n", + "0 canada.ca https://www.canada.ca/en/services.html \n", + "1 tmall.com https://maniform.world.tmall.com/category-1282... \n", + "2 tmall.com https://maniform.world.tmall.com/category-1282... \n", + "3 coches.net https://www.coches.net/fiat/segunda-mano/ \n", + "4 coches.net https://www.coches.net/fiat/segunda-mano/ \n", "\n", - " location location_domain \\\n", - "0 https://www.canada.ca/en/services.html canada.ca \n", - "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", - "2 https://maniform.world.tmall.com/category-1282... 
tmall.com \n", - "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", - "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", - "\n", - " script_domain \n", - "0 adobedtm.com \n", - "1 alicdn.com \n", - "2 alicdn.com \n", - "3 coches.net \n", - "4 coches.net " + " script_domain script_url \n", + "0 adobedtm.com https://assets.adobedtm.com/caacec67651710193d... \n", + "1 alicdn.com https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "2 alicdn.com https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", + "3 coches.net https://www.coches.net/scripts/common.min.js?2... \n", + "4 coches.net https://www.coches.net/scripts/common.min.js?2... " ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read\n", - "df = dd.read_parquet('0_sample_domains.parquet', engine='pyarrow')\n", - "df.head()" + "df = read_parquet('above_mean_domain.parquet')\n", + "df[['location_domain', 'location', 'script_domain', 'script_url']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# is JSON Column\n", + "# Add Column: is_json\n", + "\n", + "After an initial manual analysis I came to think that the huge values are JSON structured; to validate that, I included a new column holding a boolean with the result of the JSON validation.\n", "\n", - "After manual initial analysis I have think that the huge values are json structured, to validate that I included an new column that is a boolean value with the validation of json" + "After a simple check of whether the value is valid JSON, the boolean result is saved in a new column named \"is_json\".\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import json\n", + "import pandas as pd\n", "\n", "def is_json(myjson):\n", " try:\n", " json.loads(myjson)\n", " return True\n", + "\n", " except ValueError as e:\n", " return False" ] }, { "cell_type": "code", - 
"execution_count": 14, + "execution_count": 
13, "metadata": {}, "outputs": [], "source": [ - "\n", - "df['is_json'] = df['value'].apply(is_json)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### is_JSON data\n", - "Saving the new produced data with 'is_json' columns into disk" + "#To guarantee the usage of the correct parquet created above in case we start from this section\n", + "df = read_parquet('above_mean_domain.parquet')\n", + "df['is_json'] = df['value'].apply(is_json, meta=False)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/fastparquet/util.py:221: FutureWarning: A future version of pandas will default to `skipna=True`. To silence this warning, pass `skipna=True|False` explicitly.\n", - " inferred_dtype = infer_dtype(column)\n" + "[########################################] | 100% Completed | 2min 25.1s\n" ] } ], "source": [ "#save\n", - "df.to_parquet('is_json_above_mean.parquet')" + "save_parquet(df=df, name='above_mean_domain_json.parquet')" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -458,13 +454,6 @@ " \n", " \n", " value_1000\n", - " value\n", - " value_len\n", - " symbol\n", - " script_url\n", - " location\n", - " location_domain\n", - " script_domain\n", " is_json\n", " \n", " \n", @@ -472,61 +461,26 @@ " \n", " 0\n", " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " 3713\n", - " window.sessionStorage\n", - " https://assets.adobedtm.com/caacec67651710193d...\n", - " https://www.canada.ca/en/services.html\n", - " canada.ca\n", - " adobedtm.com\n", " True\n", " \n", " \n", " 1\n", " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - 
" window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", - " tmall.com\n", - " alicdn.com\n", " True\n", " \n", " \n", " 2\n", " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - " window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", - " tmall.com\n", - " alicdn.com\n", " True\n", " \n", " \n", " 3\n", " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " coches.net\n", " False\n", " \n", " \n", " 4\n", " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " coches.net\n", " False\n", " \n", " \n", @@ -534,64 +488,36 @@ "" ], "text/plain": [ - " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "\n", - " value value_len \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 
1358 \n", - "\n", - " symbol script_url \\\n", - "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", - "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "\n", - " location location_domain \\\n", - "0 https://www.canada.ca/en/services.html canada.ca \n", - "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", - "2 https://maniform.world.tmall.com/category-1282... tmall.com \n", - "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", - "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", - "\n", - " script_domain is_json \n", - "0 adobedtm.com True \n", - "1 alicdn.com True \n", - "2 alicdn.com True \n", - "3 coches.net False \n", - "4 coches.net False " + " value_1000 is_json\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... True\n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "3 usunico=17/12/2017:0-00155123:830; SessionASM=... False\n", + "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 
False" ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read\n", - "df = dd.read_parquet('is_json_above_mean.parquet')\n", - "df.head()" + "df = read_parquet('above_mean_domain_json.parquet')\n", + "df[['value_1000', 'is_json']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Value md5\n", + "# Add Column: value_md5\n", "Include new columns called \"value_md5\" that is the md5 of value column" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -603,25 +529,46 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "df['value_md5'] = df['value'].apply(md5)" + "#To guarantee the usage of the correct parquet created above in case we start from this section\n", + "df = read_parquet('above_mean_domain_json.parquet') \n", + "\n", + "df['value_md5'] = df['value'].apply(md5, meta=' ')" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000is_json
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...True
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...True
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...True
3{\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...True
4{\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...True
\n", + "
" + ], + "text/plain": [ + " value_1000 is_json\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... True\n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "3 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... True\n", + "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... True" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read all_json_above_mean\n", + "df = read_parquet('JSONs_only.parquet')\n", + "df[['value_1000', 'is_json']].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add json keys and schema columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Extract the top level keys, sort them and add as a list into another column named 'json_keys'\n", + "Will be using \"https://github.com/rnd0101/json_schema_inferencer\" to guess the json schema and save it into another column called \"json_schema\"" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from json_schema_inferencer.guess_json_schema import guess_schema\n", + "\n", + "df = read_parquet('JSONs_only.parquet')\n", + "\n", + "def jsonSchema(myjson):\n", + " try:\n", + " dct = json.loads(myjson)\n", + " value = guess_schema(dct)\n", + " l = list(value['properties'])\n", + " l.sort()\n", + " return l\n", + " except ValueError as e:\n", + " return list()\n", + " \n", + "def jsonKeys(myjson):\n", + " try:\n", + " dct = json.loads(myjson)\n", + " keys = list(dct.keys())\n", + " keys.sort()\n", + " return keys\n", + " except ValueError as e:\n", + " return list()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent 
call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=1375)\n", - "distributed.nanny - WARNING - Worker process 1375 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return 
fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n" + "[########################################] | 100% Completed | 3min 57.7s\n" ] - }, + } + ], + "source": [ + "df['json_keys'] = df['value'].apply(jsonKeys, meta='')\n", + "df['json_schema'] = df['value'].apply(jsonSchema, meta='')\n", + "save_parquet(df=df, name='JSONs_key_schema.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ { - "name": "stderr", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000json_keysjson_schema
0{\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...[im-settings][im-settings]
1{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...[APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c][APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]
2{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...[APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c][APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]
3{\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...[LastSearch, LastSearch_e, dueljs_channel_comm...[LastSearch, LastSearch_e, dueljs_channel_comm...
4{\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...[LastSearch, LastSearch_e, dueljs_channel_comm...[LastSearch, LastSearch_e, dueljs_channel_comm...
\n", + "
" + ], + "text/plain": [ + " value_1000 \\\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "3 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... \n", + "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... \n", + "\n", + " json_keys \\\n", + "0 [im-settings] \n", + "1 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", + "2 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", + "3 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", + "4 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", + "\n", + " json_schema \n", + "0 [im-settings] \n", + "1 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", + "2 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", + "3 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", + "4 [LastSearch, LastSearch_e, dueljs_channel_comm... " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#read \n", + "df = read_parquet('JSONs_key_schema.parquet')\n", + "df[['value_1000', 'json_keys', 'json_schema']].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### All NON json above the mean" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in 
wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=1421)\n", - "distributed.nanny - WARNING - Worker process 1421 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n" + "[########################################] | 100% Completed | 26.7s\n", + "Npartition: 12\n", + "[########################################] | 100% Completed | 27.8s\n" ] } ], "source": [ - "#save\n", - 
"df.to_parquet('is_json_above_mean_md5.parquet')" + "df = read_parquet('above_mean_domain_json_md5.parquet')\n", + "save_parquet(df=df[df['is_json'] == False], name='NON_JSONs_only.parquet', recalculate_partition=True)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1223,44 +984,48 @@ " location_domain\n", " script_domain\n", " is_json\n", + " value_md5\n", " \n", " \n", " \n", " \n", " 0\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " 3713\n", - " window.sessionStorage\n", - " https://assets.adobedtm.com/caacec67651710193d...\n", - " https://www.canada.ca/en/services.html\n", - " canada.ca\n", - " adobedtm.com\n", - " True\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " 1358\n", + " window.document.cookie\n", + " https://www.coches.net/scripts/common.min.js?2...\n", + " https://www.coches.net/fiat/segunda-mano/\n", + " coches.net\n", + " coches.net\n", + " False\n", + " db64465b639e01993d9212390f057628\n", " \n", " \n", " 1\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - " window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", - " tmall.com\n", - " alicdn.com\n", - " True\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " 1358\n", + " window.document.cookie\n", + " https://www.coches.net/scripts/common.min.js?2...\n", + " https://www.coches.net/fiat/segunda-mano/\n", + " coches.net\n", + " coches.net\n", + " False\n", + " db64465b639e01993d9212390f057628\n", " \n", " \n", " 2\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 
{\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 103878\n", - " window.localStorage\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", - " https://maniform.world.tmall.com/category-1282...\n", - " tmall.com\n", - " alicdn.com\n", - " True\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " 1358\n", + " window.document.cookie\n", + " https://tags.tiqcdn.com/utag/schibsted/coches....\n", + " https://www.coches.net/fiat/segunda-mano/\n", + " coches.net\n", + " tiqcdn.com\n", + " False\n", + " db64465b639e01993d9212390f057628\n", " \n", " \n", " 3\n", @@ -1268,11 +1033,12 @@ " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", " 1358\n", " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", + " https://tags.tiqcdn.com/utag/schibsted/coches....\n", " https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", - " coches.net\n", + " tiqcdn.com\n", " False\n", + " db64465b639e01993d9212390f057628\n", " \n", " \n", " 4\n", @@ -1280,11 +1046,12 @@ " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", " 1358\n", " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", + " https://tags.tiqcdn.com/utag/schibsted/coches....\n", " https://www.coches.net/fiat/segunda-mano/\n", " coches.net\n", - " coches.net\n", + " tiqcdn.com\n", " False\n", + " db64465b639e01993d9212390f057628\n", " \n", " \n", "\n", @@ -1292,900 +1059,52 @@ ], "text/plain": [ " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", + "0 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "1 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "2 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 
\n", "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", "\n", " value value_len \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... 3713 \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... 103878 \n", + "0 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "1 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", + "2 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", "\n", " symbol script_url \\\n", - "0 window.sessionStorage https://assets.adobedtm.com/caacec67651710193d... \n", - "1 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "2 window.localStorage https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "3 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "4 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "0 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "1 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", + "2 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... \n", + "3 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... \n", + "4 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... \n", "\n", - " location location_domain \\\n", - "0 https://www.canada.ca/en/services.html canada.ca \n", - "1 https://maniform.world.tmall.com/category-1282... tmall.com \n", - "2 https://maniform.world.tmall.com/category-1282... 
tmall.com \n", - "3 https://www.coches.net/fiat/segunda-mano/ coches.net \n", - "4 https://www.coches.net/fiat/segunda-mano/ coches.net \n", + " location location_domain script_domain \\\n", + "0 https://www.coches.net/fiat/segunda-mano/ coches.net coches.net \n", + "1 https://www.coches.net/fiat/segunda-mano/ coches.net coches.net \n", + "2 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", + "3 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", + "4 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", "\n", - " script_domain is_json \n", - "0 adobedtm.com True \n", - "1 alicdn.com True \n", - "2 alicdn.com True \n", - "3 coches.net False \n", - "4 coches.net False " + " is_json value_md5 \n", + "0 False db64465b639e01993d9212390f057628 \n", + "1 False db64465b639e01993d9212390f057628 \n", + "2 False db64465b639e01993d9212390f057628 \n", + "3 False db64465b639e01993d9212390f057628 \n", + "4 False db64465b639e01993d9212390f057628 " ] }, - "execution_count": 16, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#read\n", - "df = dd.read_parquet('is_json_above_mean_md5.parquet')\n", + "#read \n", + "df = read_parquet('NON_JSONs_only.parquet')\n", "df.head()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Saving other possible usefull samples to future analyses" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return 
fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21460)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21460)\n", - "distributed.nanny - WARNING - Worker process 21460 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 
344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21468)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret 
= self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in 
_get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21468)\n", - "distributed.nanny - WARNING - Worker process 21468 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", 
- "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] 
No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent 
call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21484)\n", - "distributed.nanny - WARNING - Worker process 21484 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return 
fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21476)\n", - "distributed.nanny - WARNING - Worker process 21476 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return 
fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21497)\n", - "distributed.nanny - WARNING - Worker process 21497 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n", - "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return 
fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21489)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "tornado.application - ERROR - Exception in callback >\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: 'Process' object has no attribute '_cache'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 341, in wrapper\n", - " ret = self._cache[fun]\n", - "AttributeError: _cache\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 359, in catch_zombie\n", - " yield\n", - " File 
\"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - "ProcessLookupError: [Errno 3] No such process\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/tornado/ioloop.py\", line 907, in _run\n", - " return self.callback()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/nanny.py\", line 266, in memory_monitor\n", - " memory = proc.memory_info().rss\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/__init__.py\", line 1166, in memory_info\n", - " return self._proc.memory_info()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 469, in memory_info\n", - " rawtuple = self._get_pidtaskinfo()\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 339, in wrapper\n", - " return fun(self, *args, **kwargs)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_common.py\", line 344, in wrapper\n", - " return fun(self)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 400, in _get_pidtaskinfo\n", - " ret = cext.proc_pidtaskinfo_oneshot(self.pid)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/contextlib.py\", line 99, in __exit__\n", - " self.gen.throw(type, value, traceback)\n", - " File \"/anaconda3/envs/overscripted/lib/python3.6/site-packages/psutil/_psosx.py\", line 372, in 
catch_zombie\n", - " raise AccessDenied(proc.pid, proc._name)\n", - "psutil.AccessDenied: psutil.AccessDenied (pid=21489)\n", - "distributed.nanny - WARNING - Worker process 21489 was killed by unknown signal\n", - "distributed.nanny - WARNING - Restarting worker\n" - ] - } - ], - "source": [ - "df[df['is_json'] == True].to_parquet('all_json_above_mean.parquet', engine='pyarrow')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df[df['is_json'] == False].to_parquet('all_NON_json_above_mean.parquet')" - ] - }, { "cell_type": "code", "execution_count": null, From 9e48a034ac5ad3e605623d803258ae21d77612d3 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 8 Apr 2019 15:13:53 -0300 Subject: [PATCH 15/23] Remove Quantitative comparison and Add value distribution notebook --- .../isJson_Quantitative_Comparasion.ipynb | 917 ------------------ .../isJson_Value_Distribution.ipynb | 664 +++++++++++++ 2 files changed, 664 insertions(+), 917 deletions(-) delete mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb create mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb deleted file mode 100644 index 20a2660..0000000 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_Quantitative_Comparasion.ipynb +++ /dev/null @@ -1,917 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Start dask" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", - " data = yaml.load(f.read()) or {}\n", - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", - " defaults = yaml.load(f)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 4
  • \n", - "
  • Cores: 4
  • \n", - "
  • Memory: 8.59 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dask.dataframe as dd\n", - "from dask.distributed import Client\n", - "\n", - "#Initializing client\n", - "client = Client()\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data\n", - "This notebook starts using 'is_json_above_bean.parquet', this is a filtered data that you can get by running the data preparation notebook called 'jsJson_dataPrep.ipynb'. \n", - "This parquet contains the 10% sample data filtered by values above the mean of value_len. \n", - "\n", - "This new sample has 499805 rows, meaning that its only 4,42% of the original sample (most values are smaller than the sample's mean of 1356). " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_lenis_json
03713True
1103878True
2103878True
31358False
41358False
\n", - "
" - ], - "text/plain": [ - " value_len is_json\n", - "0 3713 True\n", - "1 103878 True\n", - "2 103878 True\n", - "3 1358 False\n", - "4 1358 False" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = dd.read_parquet('is_json_above_mean.parquet', columns=['value_len', 'is_json'])\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualization: " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/worker.py:2791: UserWarning: Large object of size 1.89 MB detected in task graph: \n", - " (" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cdf['value_len'].plot(kind='hist', legend=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "jsonGroup = cdf.groupby('is_json')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And we cannot identify any non_json (blue) on the right side of the histogram. This means there all frquency of non-json values are very low or inexistent for the biggest values. Since there are so many small values, the biggest ones represent such a small portion that is hard to identify by look on the histograms and graphs. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "is_json\n", - "False AxesSubplot(0.125,0.125;0.775x0.755)\n", - "True AxesSubplot(0.125,0.125;0.775x0.755)\n", - "Name: value_len, dtype: object" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZsAAAD8CAYAAAChHgmuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAG9tJREFUeJzt3X+01XWd7/HniwN4KE0Qwbgc9aAxcyUrwhPiaro3f4TodSRN70XvGlkNDWW6tDW1EptZSaWrvGsmZ1wxmiWFPwrQUrmKl4vkTHcqfxwUlR8RJ6I4YYKAaJOowPv+sT8HN7DPPntvzuds2Of1WOu79vf7/n5+7W8d33y/38/+fhURmJmZ5TSg3gMwM7PG52RjZmbZOdmYmVl2TjZmZpadk42ZmWXnZGNmZtk52ZiZWXZONmZmlp2TjZmZZTew3gM4VBx77LHR2tpa72GYmR1Wli9f/nJEjOipnJNN0traSnt7e72HYWZ2WJH020rK+TKamZll52RjZmbZOdmYmVl2vmdjZlalt956i87OTnbu3FnvofSZ5uZmWlpaGDRoUE31nWzMzKrU2dnJUUcdRWtrK5LqPZzsIoKtW7fS2dnJmDFjamrDl9HMzKq0c+dOhg8f3i8SDYAkhg8fflBnck42ZmY16C+JpsvBfl8nGzMzy873bMzMDlLrrEd6tb0N3/hvPZZpamrife97397tBx98kO6egrJhwwYuuOACVq5c2VtDrJqTTS9onfVIRf/nMDPrLUOGDGHFihX1HkbFsl1Gk9Qs6SlJz0laJekrKT5G0pOS1klaIGlwih+RtjvS/taitq5P8bWSzi2KT0mxDkmziuIl+zAza2QbNmzgIx/5CBMmTGDChAn8/Oc/P6DMqlWrmDhxIuPHj+f9738/69atA+Cee+7ZG//0pz/N7t27e3VsOe/ZvAGcFREfAMYDUyRNAm4GbomIscB2YEYqPwPYHhHvAW5J5ZA0DpgGvBeYAvyLpCZJTcAc4DxgHHBZKkuZPszMGsLrr7/O+PHjGT9+PBdddBEAI0eOZOnSpTzzzDMsWLCAa6655oB6t99+O9deey0rVqygvb2dlpYW1qxZw4IFC/jZz37GihUraGpq4t577+3V8Wa7jBYRAfwxbQ5KSwBnAZen+DxgNnAbMDWtA9wPfEuF6Q9TgfkR8QbwG0kdwMRUriMi1gNImg9MlbSmTB9mZg2h1GW0t956i6uvvnpvwvjVr351QL0zzjiDm266ic7OTi6++GLGjh3LsmXLWL58OR/60IeAQiIbOXJkr4436z2bdPaxHHgPhbOQXwOvRMSuVKQTGJ3WRwMbASJil6QdwPAUf6Ko2eI6G/eLn57qdNeHmVnDuuWWWzjuuON47rnn2LNnD83NzQeUufzyyzn99NN55JFHOPfcc/nud79LRDB9+nS+/vWvZxtb1qnPEbE7IsYDLRTORk4pVSx9lprEHb0YP4CkmZLaJbVv2bKlVBEzs8PGjh07GDVqFAMGDODuu+8ued9l/fr1nHTSSVxzzTVceOGFPP/885x99tncf//9bN68GYBt27bx299W9OaAivXJbLSIeEXSvwKTgKGSBqYzjxZgUyrWCRwPdEoaCBwNbCuKdymuUyr+cpk+9h/XHcAdAG1tbSUTkplZTw6V2aif/exn+cQnPsF9993HmWeeyTvf+c4DyixYsIB77rmHQYMG8e53v5svf/nLHHPMMdx4441MnjyZPXv2MGjQIObMmcOJJ57Ye4OLiCwLMAIYmtaHAP8PuAC4D5iW4rcDn03rVwG3p/VpwMK0/l7gOeAIYAywHmiikCjXp9jgVOa9qU7JPsotp512WtTqxOserrmumR1+Vq9eXe8h1EWp7w20RwU5IeeZzShgXrpvMyAlj4clrQbmS7oReBa4M5W/E7g7TQDYlhIOEbFK0kJgNbALuCoidgNIuhpYkpLP3IhYldq6rps+zMysDnLORnse+GC
J+Hrenk1WHN8JXNpNWzcBN5WILwYWV9qHmZnVh5+NZmZm2TnZmJlZdk42ZmaWnZONmZll56c+m5kdrNlH93J7O8ru3rp1K2effTYAf/jDH2hqamLEiBEAPPXUUwwefOg9e9jJxszsMDN8+PC9z0WbPXs2Rx55JF/4whf2KdP1+5YBAw6NC1iHxijMzOygdXR0cOqpp/KZz3yGCRMmsHHjRoYOHbp3//z58/nUpz4FwEsvvcTFF19MW1sbEydO5Iknnuiu2V7hZGNm1kBWr17NjBkzePbZZxk9uvtnEF9zzTV88YtfpL29nYULF+5NQrn4MpqZWQM5+eST974qoJzHHnuMtWvX7t3evn07r7/+OkOGDMkyLicbM7MGUvzwzQEDBnQ9qxKAnTt37l2PiD6dTODLaGZmDWrAgAEMGzaMdevWsWfPHh544IG9+8455xzmzJmzd3v/F7H1Np/ZmJkdrB6mKtfTzTffzJQpUzjhhBMYN24cb7zxBgBz5szhyiuv5Hvf+x67du3izDPP3Cf59DYVn2L1Z21tbdHe3l5T3dZZjxwy77Mws/zWrFnDKaeUehdkYyv1vSUtj4i2nur6MpqZmWXnZGNmZtk52ZiZ1aC/3YI42O/rZGNmVqXm5ma2bt3abxJORLB161aam5trbsOz0czMqtTS0kJnZydbtmyp91D6THNzMy0tLTXXd7IxM6vSoEGDGDNmTL2HcVjxZTQzM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMssuWbCQdL+lxSWskrZJ0bYrPlvR7SSvScn5RnesldUhaK+ncoviUFOuQNKsoPkbSk5LWSVogaXCKH5G2O9L+1lzf08zMepbzzGYX8PmIOAWYBFwlaVzad0tEjE/LYoC0bxrwXmAK8C+SmiQ1AXOA84BxwGVF7dyc2hoLbAdmpPgMYHtEvAe4JZUzM7M6yZZsIuLFiHgmrb8GrAG6fyE2TAXmR8QbEfEboAOYmJaOiFgfEW8C84GpkgScBdyf6s8DPl7U1ry0fj9wdipvZmZ10Cf3bNJlrA8CT6bQ1ZKelzRX0rAUGw1sLKrWmWLdxYcDr0TErv3i+7SV9u9I5fcf10xJ7ZLa+9NjJ8zM+lr2ZCPpSOBHwOci4lXgNuBkYDzwIvCPXUVLVI8a4uXa2jcQcUdEtEVE24gRI8p+DzMzq13WZCNpEIVEc29E/BggIl6KiN0RsQf4DoXLZFA4Mzm+qHoLsKlM/GVgqKSB+8X3aSvtPxrY1rvfzszMKpVzNpqAO4E1EfHNovioomIXASvT+iJgWppJNgYYCzwFPA2MTTPPBlOYRLAoCs/2fhy4JNWfDjxU1Nb0tH4J8JPoL88CNzM7BOV86vOHgb8CXpC0IsW+RGE22XgKl7U2AJ8GiIhVkhYCqynMZLsqInYDSLoaWAI0AXMjYlVq7zpgvqQbgWcpJDfS592SOiic0UzL+D3NzKwH2ZJNRPw7pe+dLC5T5ybgphLxxaXqRcR63r4MVxzfCVxazXjNzCwfP0HAzMyyc7IxM7PsnGzMzCw7JxszM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsnGzMzCw7JxszM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLLluykXS8pMclrZG0StK1KX6MpKWS1qXPYSkuSbdK6pD0vKQJRW1NT+XXSZpeFD9N0gupzq2SVK4PMzOrj5xnNruAz0fEKcAk4CpJ44BZwLKIGAssS9sA5wFj0zITuA0KiQO4ATgdmAjcUJQ8bktlu+pNSfHu+jAzszrIlmwi4sWIeCatvwasAUYDU4F5qdg84ONpfSpwVxQ8AQyVNAo4F1gaEdsiYjuwFJiS9r0rIn4REQHctV9bpfowM7M66JN7NpJagQ8CTwLHRcSLUEhIwMhUbDSwsah
aZ4qVi3eWiFOmDzMzq4PsyUbSkcCPgM9FxKvlipaIRQ3xasY2U1K7pPYtW7ZUU9XMzKqQNdlIGkQh0dwbET9O4ZfSJTDS5+YU7wSOL6reAmzqId5SIl6uj31ExB0R0RYRbSNGjKjtS5qZWY8qSjaSTq224TQz7E5gTUR8s2jXIqBrRtl04KGi+BVpVtokYEe6BLYEmCxpWJoYMBlYkva9JmlS6uuK/doq1YeZmdXBwArL3S5pMPB94AcR8UoFdT4M/BXwgqQVKfYl4BvAQkkzgN8Bl6Z9i4HzgQ7gT8AnASJim6SvAU+ncl+NiG1p/co0piHAo2mhTB9mZlYHFSWbiPgLSWOBvwbaJT0FfC8ilpap8++Uvq8CcHaJ8gFc1U1bc4G5JeLtwAFnXRGxtVQfZmZWHxXfs4mIdcDfA9cB/xW4VdIvJV2ca3BmZtYYKr1n835Jt1D4rcxZwF+mH2ueBdyScXxmZtYAKr1n8y3gO8CXIuL1rmBEbJL091lGZmZmDaPSZHM+8HpE7AaQNABojog/RcTd2UZnZmYNodJ7No9RmPHV5R0pZmZm1qNKz2yaI+KPXRsR8UdJ78g0psPOhubLYXadOp+9o04dm5lVrtIzm//Y75H/pwGvlylvZma2V6VnNp8D7pPU9TiYUcD/yDMkMzNrNJX+qPNpSf8Z+HMKP9T8ZUS8lXVkZmbWMCo9swH4ENCa6nxQEhFxV5ZRmZlZQ6ko2Ui6GzgZWAHsTuGuF5aZmZmVVemZTRswLj2/zMzMrCqVzkZbCbw750DMzKxxVXpmcyywOj3t+Y2uYERcmGVUZmbWUCpNNrNzDsLMzBpbpVOf/03SicDYiHgsPT2gKe/QzMysUVT6ioG/Ae4Hvp1Co4EHcw3KzMwaS6UTBK6i8JrnV2Hvi9RG5hqUmZk1lkqTzRsR8WbXhqSBFH5nY2Zm1qNKk82/SfoSMETSx4D7gP+db1hmZtZIKk02s4AtwAvAp4HFgN/QaWZmFal0NtoeCq+F/k7e4ZiZWSOq9Nlov6HEPZqIOKnXR2RmZg2nmmejdWkGLgWO6f3hmJlZI6ronk1EbC1afh8R/wSclXlsZmbWICr9UeeEoqVN0meAo3qoM1fSZkkri2KzJf1e0oq0nF+073pJHZLWSjq3KD4lxTokzSqKj5H0pKR1khZIGpziR6TtjrS/teKjYWZmWVR6Ge0fi9Z3ARuA/95Dne8D3+LAd97cEhH/UByQNA6YBrwX+E/AY5L+LO2eA3wM6ASelrQoIlYDN6e25ku6HZgB3JY+t0fEeyRNS+X8CmszszqqdDbamdU2HBE/reKsYiowPyLeAH4jqQOYmPZ1RMR6AEnzgamS1lC4jHd5KjOPwsNCb0ttzU7x+4FvSZLfxWNmVj+Vzkb723L7I+KbVfR5taQrgHbg8xGxncKz1p4oKtOZYgAb94ufDgwHXomIXSXKj+6qExG7JO1I5V+uYoxmZtaLKv1RZxtwJYX/kI8GPgOMo3Dfpuy9m/3cRuH10uOBF3n78pxKlI0a4uXaOoCkmZLaJbVv2bKl3LjNzOwgVPPytAkR8RoUbvQD90XEp6rpLCJe6lqX9B3g4bTZCRxfVLQF2JTWS8VfBoZKGpjOborLd7XVmZ7hdjSwrZvx3AHcAdDW1ubLbGZmmVR6ZnMC8GbR9ptAa7WdSRpVtHkRhddNAywCpqWZZGOAscBTwNPA2DTzbDCFSQSL0v2Xx4FLUv3pwENFbU1P65cAP/H9GjOz+qr0zOZu4ClJD1C4JHURB84y24ekHwIfBY6V1AncAHxU0vjUxgYKz1kjIlZJWgispjDb7aqI2J3auRpYQuFlbXMjYlXq4jpgvqQbgWeBO1P8TuDuNMlgG4UEZWZmdaRK/9EvaQLwkbT504h4Ntuo6qCtrS3a29trqzz76N4dTFV976hf32bW70laHhFtPZWr9DIawDuAVyPinyncDxlT8+jMzKxfqfQJAjdQuGx1fQoNAu7JNSg
zM2sslZ7ZXARcCPwHQERsoropz2Zm1o9VmmzeTDO6AkDSO/MNyczMGk2lyWahpG9T+G3L3wCP4RepmZlZhSp9Nto/SPoY8Crw58CXI2Jp1pGZmVnD6DHZSGoClkTEOYATjJmZVa3Hy2jpx5V/klTHH5OYmdnhrNInCOwEXpC0lDQjDSAirskyKjMzayiVJptH0mJmZla1sslG0gkR8buImNdXAzIzs8bT0z2bB7tWJP0o81jMzKxB9ZRsil9EdlLOgZiZWePqKdlEN+tmZmYV62mCwAckvUrhDGdIWidtR0S8K+vozMysIZRNNhHR1FcDMTOzxlXN+2zMzMxq4mRjZmbZOdmYmVl2TjZmZpadk42ZmWXnZGNmZtk52ZiZWXZONmZmlp2TjZmZZZct2UiaK2mzpJVFsWMkLZW0Ln0OS3FJulVSh6TnJU0oqjM9lV8naXpR/DRJL6Q6t0pSuT7MzKx+cp7ZfB+Ysl9sFrAsIsYCy9I2wHnA2LTMBG6DQuIAbgBOByYCNxQlj9tS2a56U3row8zM6iRbsomInwLb9gtPBbpexDYP+HhR/K4oeAIYKmkUcC6wNCK2RcR2YCkwJe17V0T8IiICuGu/tkr1YWZmddLX92yOi4gXAdLnyBQfDWwsKteZYuXinSXi5fo4gKSZktoltW/ZsqXmL2VmZuUdKhMEVCIWNcSrEhF3RERbRLSNGDGi2upmZlahvk42L6VLYKTPzSneCRxfVK4F2NRDvKVEvFwfZmZWJ32dbBYBXTPKpgMPFcWvSLPSJgE70iWwJcBkScPSxIDJwJK07zVJk9IstCv2a6tUH2ZmVic9vamzZpJ+CHwUOFZSJ4VZZd8AFkqaAfwOuDQVXwycD3QAfwI+CRAR2yR9DXg6lftqRHRNOriSwoy3IcCjaaFMH2ZmVifZkk1EXNbNrrNLlA3gqm7amQvMLRFvB04tEd9aqg8zM6ufQ2WCgJmZNTAnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsnGzMzCw7JxszM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsnGzMzCw7JxszM8vOycbMzLJzsjEzs+zqkmwkbZD0gqQVktpT7BhJSyWtS5/DUlySbpXUIel5SROK2pmeyq+TNL0oflpqvyPVVd9/SzMz61LPM5szI2J8RLSl7VnAsogYCyxL2wDnAWPTMhO4DQrJCbgBOB2YCNzQlaBSmZlF9abk/zpmZtadQ+ky2lRgXlqfB3y8KH5XFDwBDJU0CjgXWBoR2yJiO7AUmJL2vSsifhERAdxV1JaZmdVBvZJNAP9X0nJJM1PsuIh4ESB9jkzx0cDGorqdKVYu3lkibmZmdTKwTv1+OCI2SRoJLJX0yzJlS91viRriBzZcSHQzAU444YTyIzYzs5rV5cwmIjalz83AAxTuubyULoGRPjen4p3A8UXVW4BNPcRbSsRLjeOOiGiLiLYRI0Yc7NcyM7Nu9HmykfROSUd1rQOTgZXAIqBrRtl04KG0vgi4Is1KmwTsSJfZlgCTJQ1LEwMmA0vSvtckTUqz0K4oasvMzOqgHpfRjgMeSLORBwI/iIj/I+lpYKGkGcDvgEtT+cXA+UAH8CfgkwARsU3S14CnU7mvRsS2tH4l8H1gCPBoWszMrE76PNlExHrgAyXiW4GzS8QDuKqbtuYCc0vE24FTD3qwZmbWKw6lqc9mZtagnGzMzCw7JxszM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsnGzMzCw7JxszM8vOycbMzLJzsjEzs+ycbMzMLDsnGzMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsBtZ7ALlImgL8M9AEfDcivlHnIeUx++g69bujPv2
a2WGpIc9sJDUBc4DzgHHAZZLG1XdUZmb9V0MmG2Ai0BER6yPiTWA+MLXOYzIz67ca9TLaaGBj0XYncHqdxtKYfPnOzKrQqMlGJWJxQCFpJjAzbf5R0toa+zsWeLnGuo0o3/H4Sqn/aQ9p/v/Gvnw89tUIx+PESgo1arLpBI4v2m4BNu1fKCLuAO442M4ktUdE28G20yh8PN7mY7EvH4999afj0aj3bJ4GxkoaI2kwMA1YVOcxmZn1Ww15ZhMRuyRdDSyhMPV5bkSsqvOwzMz6rYZMNgARsRhY3EfdHfSluAbj4/E2H4t9+Xjsq98cD0UccN/czMysVzXqPRszMzuEONkcJElTJK2V1CFpVr3HUy1JcyVtlrSyKHaMpKWS1qXPYSkuSbem7/q8pAlFdaan8uskTS+KnybphVTnVkmqtY8+OBbHS3pc0hpJqyRd28+PR7OkpyQ9l47HV1J8jKQn01gXpEk4SDoibXek/a1FbV2f4mslnVsUL/n3U0sffUFSk6RnJT1c6zgb5VhULSK81LhQmHzwa+AkYDDwHDCu3uOq8jv8F2ACsLIo9r+AWWl9FnBzWj8feJTC75gmAU+m+DHA+vQ5LK0PS/ueAs5IdR4Fzquljz46FqOACWn9KOBXFB531F+Ph4Aj0/og4Mk0hoXAtBS/HbgyrX8WuD2tTwMWpPVx6W/jCGBM+ptpKvf3U20ffXhM/hb4AfBwLeNspGNR9bGr9wAO5yX9R2NJ0fb1wPX1HlcN36OVfZPNWmBUWh8FrE3r3wYu278ccBnw7aL4t1NsFPDLovjectX2Uafj8hDwMR+PAHgH8AyFJ3G8DAxM8b1/AxRmf56R1gemctr/76KrXHd/P6lOVX300TFoAZYBZwEP1zLORjkWtSy+jHZwSj0WZ3SdxtKbjouIFwHS58gU7+77lot3lojX0kefSpckPkjhX/P99niky0YrgM3AUgr/+n4lInaVGM/esab9O4DhVH+chtfQR1/4J+CLwJ60Xcs4G+VYVM3J5uBU9FicBtLd9602XksffUbSkcCPgM9FxKvlipaINdTxiIjdETGewr/qJwKnlBlPbx2Pct+5LsdD0gXA5ohYXhwuM5aGPRa1crI5OBU9Fucw9JKkUQDpc3OKd/d9y8VbSsRr6aNPSBpEIdHcGxE/rnGsDXM8ukTEK8C/UrhnM1RS12/0isezd6xp/9HANqo/Ti/X0EduHwYulLSBwlPkz6JwptMfj0VNnGwOTqM+FmcR0DWDajqFexdd8SvSDKlJwI50yWcJMFnSsDSLajKF68ovAq9JmpRmXV2xX1vV9JFdGuOdwJqI+GbRrv56PEZIGprWhwDnAGuAx4FLuhlr13e4BPhJFG4oLAKmpdlTY4CxFCZKlPz7SXWq7SOriLg+IloiojWN8ycR8T9rGOdhfyxqVu+bRof7QmG20K8oXMv+u3qPp4bx/xB4EXiLwr+UZlC47rsMWJc+j0llReGldL8GXgDaitr5a6AjLZ8sircBK1Odb/H2D4mr7qMPjsVfULgM8TywIi3n9+Pj8X7g2XQ8VgJfTvGTKPwHsgO4DzgixZvTdkfaf1JRW3+XvsNa0gy8cn8/tfTRh8flo7w9G61fH4tqFj9BwMzMsvNlNDMzy87JxszMsnOyMTOz7JxszMwsOycbMzPLzsnGzMyyc7IxM7PsnGzMzCy7/w9D5zzuFuBn3AAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "jsonGroup['value_len'].plot(kind='hist', legend=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sample overview\n", - "Some overview about the sample after the data prep: \n", - "- Rows: 499805\n", - "- Mean: 27829.33,\n", - "- Min: 1357,\n", - "- Max: 4496861\n", - "- Std: 122092.41" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "#Hardcoded data to fast use, but your can update for the calculed value within the next few cells \n", - "MEAN = 27829.33\n", - "MIN = 1357\n", - "MAX = 4496861\n", - "STD = 122092.41\n", - "COUNT = 499805\n", - "\n", - "#Information for original sample.\n", - "ORIG_MEAN = 1356.97\n", - "ORIG_MIN = 0\n", - "ORIG_MAX = 4496861\n", - "ORIG_STD = 26310.62\n", - "ORIG_COUNT = 11292867\n", - "\n", - "#hardcoded information about described data for values one std above the mean: \n", - "A_MEAN = 271204.44\n", - "A_MIN = 27669\n", - "A_MAX = 4496861\n", - "A_STD = 306555\n", - "A_COUNT = 46745" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def describedData(df):\n", - " tmp_mean = df['value_len'].mean()\n", - " tmp_min = df['value_len'].min()\n", - " tmp_max = df['value_len'].max()\n", - " tmp_std = df['value_len'].std()\n", - " tmp_count = df['value_len'].count()\n", - " (tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count) = dd.compute(tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count);\n", - " return (tmp_mean, tmp_min, tmp_max, tmp_std, tmp_count)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "27829.332847810645 1357 4496861 122092.41371885882 499805\n" - ] - } - ], - "source": [ - "#Calculate the described data for mean sample\n", - "(MEAN, MIN, MAX, 
STD, COUNT) = describedData(df)\n", - "print(MEAN, MIN, MAX, STD, COUNT)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1356.9776628910975 0 4496861 26310.62140481331 11292867\n" - ] - } - ], - "source": [ - "#Calculate the described data for original sample\n", - "(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT) = describedData(dd.read_parquet('sample_0.parquet'))\n", - "print(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "271204.44978072523 27669 4496861 306555.0273738244 46745\n" - ] - } - ], - "source": [ - "#Calculate the described data for one std above the mean (using mean and stf of the original sample)\n", - "std_above = df[df['value_len'] > ORIG_STD + ORIG_MEAN]\n", - "(A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT) = describedData(std_above)\n", - "print(A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following cell will create a dataframe of the described data calculated above and save it into a csv to fulture use, if calculations are not possible. " - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MEANMINMAXSTDCOUNT
ORIGINAL1356.9776630449686126310.62140511292867
ABOVE_MEAN27829.33284813574496861122092.413719499805
ABOVE_STD271204.449781276694496861306555.02737446745
\n", - "
" - ], - "text/plain": [ - " MEAN MIN MAX STD COUNT\n", - "ORIGINAL 1356.977663 0 4496861 26310.621405 11292867\n", - "ABOVE_MEAN 27829.332848 1357 4496861 122092.413719 499805\n", - "ABOVE_STD 271204.449781 27669 4496861 306555.027374 46745" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Comparasion of this sample and original 10% sample:\n", - "import pandas as pd\n", - "import numpy as np\n", - "%matplotlib inline\n", - "\n", - "compare = pd.DataFrame([(ORIG_MEAN, ORIG_MIN, ORIG_MAX, ORIG_STD, ORIG_COUNT),\n", - " (MEAN, MIN, MAX, STD, COUNT), \n", - " (A_MEAN, A_MIN, A_MAX, A_STD, A_COUNT)], \n", - " columns=['MEAN', 'MIN', 'MAX', 'STD', 'COUNT'],\n", - " index= ['ORIGINAL','ABOVE_MEAN', 'ABOVE_STD'])\n", - "compare.to_csv('describedData.csv')\n", - "compare" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Number of rows\n", - "The number of rows after filtering for values above the mean are about 4.42% of the original sample. \n", - "And the count for values one std above the mean is just 9.35% of this sample or 0.41% of original sample. \n", - "By this we can see that the really big values represent just a very small portion of the whole. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Above the mean / original 4.425846864219688\n", - "1 STD Above the mean / original 0.41393385754033946\n", - "1 STD Above the mean / Above mean 9.35264753253769\n" - ] - } - ], - "source": [ - "print('Above the mean / original', COUNT / ORIG_COUNT * 100)\n", - "print('1 STD Above the mean / original', A_COUNT / ORIG_COUNT * 100)\n", - "print('1 STD Above the mean / Above mean', A_COUNT / COUNT * 100)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "compare['COUNT'].plot(kind='pie')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Max and Min values\n", - "\n", - "it is expected that the maximum will be the same for all mentioned samples since the filtering is being made by the minimum, and is also expected that the min is the value used to filter." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "compare[['MIN','MAX']].plot()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Mean and Std\n", - "> A low standard deviation indicates that the data points tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the data points are spread out over a wider range of values. (https://en.wikipedia.org/wiki/Standard_deviation)\n", - "\n", - "It is noticeable that both mean and std are increassing as the data is filtered by bigger values. \n", - "\n", - "The increase of the mean is to be expected, since we are eliminating the smaller values and leaving only the bigger ones. \n", - "\n", - "But the STD is not necessary like the mean where it will increase after the filter, but since it is the case here we can safaly assume that as the values get bigger the more spread out they are, the mean of the sample is less accurate to represent the whole dataset since they have a huge difference of value from one another. " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "compare[['MEAN','STD']].plot()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# IS JSON\n", - "\n", - "This whole sample has: \n", - "- False: 307577 rows\n", - " - 61,54% are not valid JSON\n", - " \n", - "- True: 192228 rows\n", - " - 38,46% are valid JSON" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPoAAADuCAYAAAAQqxqwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFdpJREFUeJzt3XmUXGWZx/HvU72xN5sssuQmkLDLzghhVVTGQoHjAC5wcEfCJg7IZVEvI8MpNhVGDFuGHZkTFZFcCMNiQhIIiWwBRIFgQcIuhCKYhaTzzh+3mmlip7uqu6qee+/7fM6p091Fdd9fh/r13d9XnHMYY/KtoB3AGNN8VnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFN8YDVnRjPGBFH4SI9IjIE30ewQCvDUTk6dalM6Y27doBMmCxc24X7RDGDIet0YeguuaeJiKPVR/79POaHURkVnUrYI6IjK4+f0yf568SkbbW/wbGN1b0wa3eZ7P99upzbwKfcc7tBhwNXN7P930PuKy6NbAHMF9Etqu+fmz1+R7ga83/FYzvbNN9cP1tuncAvxSR3rKO6ef7HgbOEZHNgd85554XkU8DuwOzRQRgdZI/GsY0lRV9aE4D3gB2JtkqWrLyC5xzt4rII0ARuEdEvg0IcINz7qxWhjXGij403cB859wKETkO+Kf9bBEZBbzonLu8+vkngP8F7hCRnzvn3hSR9YG1nXMvtSJ0EMZrACOAzYHOGr/tfeA14JVyqfiPZmUzzSU2P/rAROR959xaKz03GvgtsAj4I3Cyc26t6qm3Sc65HUXkLOAYYBnwOvBV59w7InI0cBbJlsAy4ETn3MxGZA3CeF0gICnziJU+HwFsOMxFLAReBeYBc4EXgOerH+eWS8V/2rIx6WBFz7AgjAPggD6PUYpxlgOPAlOBKcD0cqm4UDGP6cOKniFBGG/NR4u9pW6iAfUAj5MUfyowrVwqvqsbyV9W9BSr7lMfDhxKUuyP6yYalhXAkyRr+9vKpeIs3Th+saKnTBDG7cBnSM6vHw6sqZuoaZ4BrgduKpeKbyhnyT0rekoEYTwG+C5wLLCRcpxWWg7cDVwHTCqXisuU8+SSFV1REMadwBHA8cBBynHS4C3gFuC6cqk4RztMnljRFVT3vU8GfoBfa+96zAJ+Ui4VJ2sHyQMregsFYdwBfAc4F9hUOU5WPAicXS4VZ2gHyTIregsEYVw
guXgmAkbqpsmsu4BzyqXiE9pBssiK3mRBGB8B/BTYQTtLDjhgIvCjcqn4nHaYLLGiN0kQxgcDFwB7amfJoeXADcB55VJxnnaYLLCiN1gQxhsBVwOHaWfxwBLgPODicqnYox0mzazoDRSE8eEkJf+YdhbPzAa+US4Vn9EOklZW9AYIwngd4DLg68pRfPYBybGQUrlUXK4dJm2s6MMUhPEBJPuLI7SzGCBZu3+lXCrO1Q6SJlb0IQrCuIvkYNtpJCPHmPRYCJxULhVv1A6SFlb0IQjCeFfgJuyUWdrdCpxQLhXf0w6izYpepyCMv0pyA0atQzEZXc8B/1ouFV/UDqLJhnuuQxDGZwI3YyXPkjHAQ0EY764dRJOt0WtQvYT1cuBE7SxmyN4HjvT1Jhlbow8iCOPVgd9gJc+6tYA7gzD+pnYQDVb0AQRhvAFwH8k94yb72oEJQRj/WDtIq9mm+yoEYTySZOSTbbSzmKa4huSIvBeXzlrR+xGE8W4kt0VurJ3FNFUMHO3DxBRW9JVUz5FPAdZRjmJa44/AIeVS8QPtIM1k++h9BGE8GpiMldwnBwHXB2Gc66sbrehVQRhvBtyLjeHmo68AF2qHaCbbdAeCMF4fmAZsr53FqDq5XCr+UjtEM3hf9OqQy/cB+2lnMepWkFxU8zvtII1mm+5wLVZykygAtwRhPFY7SKN5XfQgjM8lmRnFmF6rAX8Iwnhb7SCN5O2mexDGRwG3YfeSm/6Vgb3LpeLr2kEawcuiB2G8FcnMnnmdwNA0xn3AZ8ulYuZL4t2mexDGbcCNWMnN4A4GTtIO0QjeFR34IbCPdgiTGRfmYX/dq033IIx3IZm8r0M7i8mUP5Hsr2d2dFlv1ujVwRxvwkpu6rcHycSYmeVN0YHzgR21Q5jMOicI48xOr+XFpnsQxvuT3KXk0x8203h/BXYtl4qLtYPUK/dv/CCM1yaZYCH3v6tpum3I6M0vPrz5LwYC7RAmN04Kwvgg7RD1yvWmexDG2wFPAW3aWUyuPAnsVi4VV2gHqVXe1+jnYyU3jbczGbtHIrdr9CCM9yCZcM+YZpgPjMnKgbk8r9H/UzuAybXNge9rh6hVLtfo1amMp2jnMLn3HhCUS8UF2kEGk9c1+gXaAYwX1iGZNjv1crdGD8L4UOBO7RzGGxWStfq72kEGkqs1enXI3vO1cxivdJOBtXquig4cTXLqw5hWOjUI43W1Qwwkb0U/VTuA8VI3KT+vnpuiVwcH+KR2DuOtb2gHGEhuik7K/6FN7u0ahHFqdxtzUfTqOHCp3nQyXkjtyiYXRQcOATbVDmG897UgjFM5glFeip7av6TGKxsCX9QO0Z/MFz0I4w2BL2jnMKYqlSudzBcd+BrQqR3CmKpDgjBO3W5kHoqeyr+gxlupPDCc6aJXT2ek9pSG8dZx2gFWlumiA4dqBzCmH9sHYbyFdoi+sl70g7UDGLMKB2gH6CuzRQ/CeE1sDjWTXlb0BtkfO9pu0suK3iC22W7SbHSaTrNluej7aQcwZhCpWatnsuhBGK8O7KKdw5hBWNGHaQ9s+mOTflb0YbKj7SYLtgvCeCPtEJDdou+tHcCYGu2vHQCyW/TttQMYU6MdtANABoteHdJ5S+0cxtQo0A4A0F7rC0VkDHAGMKLv9znnPtWEXAPZGOhq8TKNGaoR2gGgjqIDE4ErgWuAnubEqUkq/uGMqVEq3q/1bLovd86Nd87Ncs492vtoWrJVS8U/nDE12iIIY/Vd5HoC3Cki40RkUxFZv/fRtGSrZkU3WdJBCgYurWfTvfdm+jP6POeAUY2LUxMrusmaEcArmgFqLrpzbmQzg9TBim6yJgAe0gxQz1H3DuAE/v8CgCnAVc65ZU3INRAruska9fdsPZvu40n2N35V/frY6nPfbnSoQaj/oxlTJ/X3bD1F39M513cgxgdE5MlGBxpI9a61dVq5TGMaQP1gXD1H3XtEZKveL0RkFK0/n65+msKYIVAfCameNfo
ZwB9F5EVASDZHbEx1Ywanfkt1PUfd7xeR0cA2JEX/i3NuadOSGZMf6kWveVNYRI4EOp1zc0jmOvu1iOzWtGTG5EemNt1/5JybKCL7Ap8DLiE56v4vTUlmWmonefH5w9pmvKadI4+W0vEyFFUz1FP03gNvRWC8c+4OEYkaH8m02ucLjzx2Rcdlo0UYrZ0lp2ZpB6jnKPYrInIVcBRwl4h01fn9JoXGtd0x44qOy3YSYW3tLDmmebcnUN8a/SjgEOAS59y7IrIpH73u3WTMpR3jp3ypbdqB2jk8kP6ii8g6zrn3gNVILnuletfaUmC2iLQ559R/EVO7Ait6JnaeN2P3wvMHamfxxELtALWs0W8lmbX0UZK71aT6fO/na4nINc65s5sT8SNcC5aRa6uxdPH9Xac/tZm8nYpBCz3xtnaAQfexnXOHVj+OdM6Nqn788HNgE+AIEWn6gI3lUnERsKjZy8mr9am8Patr3NzN5O29tLN45u/aAYZ9MM051+Oc2w64uQF5avFyi5aTKyPl1Zdndp28cB1ZvKN2Fg+lf41eBxn8JQ3xUouWkxt7ybN/vr/zjNU7ZXmgncVT2V+j99Gq/Wcreh2OKEyb/T+dPx1REPcx7SweU1+j13N6LS2s6DU6rX3itFPabt9bJJP/n/PkDe0AjXwDfNDAnzUQK3oNftXxi6mfb5uVmkn+PPe8doB6bmoZKyJrVj8/RkR+JiIfjpzhnPtkMwL2w4o+gDZ6lk/qPHualTw1FhJV1O8hqGcffTywSER2Bn5IUrgbm5JqYFb0VViDJf+Y0XXKEzsWyvtpZzEf+qt2AKh/AgcHHAZc5py7DFSuj34VWK6w3FTbiAVvze4a9/ImsmAP7SzmIzJX9IUichZwDBCLSBsKN9SXS8UeYH6rl5tmY2Te32Z0nbJkTVmynXYW80/+oh0A6iv60STXt3/LOfc6sBlwcVNSDS4VfyXTYGzh6acnd4bdHdKzhXYW069UvFdrLrpz7nXn3M+cc9OqX7/snNPYRweYqbTcVPly2wOP3NxxwVYFcRpTY5naPKYdAGq7e226c25fEVnIRy+KEcA55zSGX35YYZmpclb7LQ9+ty0eK0KbdhazSm8QVeZqh4Aaiu6c27f6MU0DE8wEVuDlwBfOTei4ZOqn2x4/UDuJGdQM7QC9MlmUcqlYAf6snaPV2lm+7J7OMx+ykmeGFb0BpmsHaKW1WPTezK6TntqmMH+sdhZTMyt6A9ynHaBVNuXt12d1jXttQ3nPhtfOjsWk5EAcZLvoD5Dsp+fa9lKeO63r1J415INttLOYujxEVGn1TMOrlNmil0vFBSTDW+XWpwqPPTmp8+wN2mXFZtpZTN3u0A7QV2aLXnWvdoBmOa5t8sMTOi7ZtiCsq53FDIkVvYHu1g7QDP/Rft3UqP3GT4rQpZ3FDMnjRJVUDXmW9QEJZgB/A0ZqB2kM527puODBsW3P2C2m2fZ77QAry/QavVwqOuB67RyN0MmypQ90/vtMK3kuWNGb4AYyPt77OrxfeaTrxGdHFV7fWzuLGba5RJU52iFWlvmil0vFl0hOtWXS5vLWq7O6Tvz7evL+LtpZTEP8t3aA/mS+6FXXaQcYip3lheemdn6/sJos20o7i2mI5aT0vZiXov8OqGiHqMfnCrMe/33njzdpE7eJdhbTMHEaxofrTy6KXi4VFwO3aeeo1XfbJs24suMXO4igcYuvaZ6rtQOsSi6KXpXKTaaVXdR+5ZSz2m/dR4RO7SymoeYBk7VDrEpuil4uFR8BntXOsSrCihUTO6MHj2p/8ECRlk1fZVrnWqJKau+9yE3Rq36lHaA/q7F0ydTO02btWXjOpirOp/eBK7RDDCRvRb+alM22uh7vvfNI14nPb1l4q1UTXJjWG09UUZ9fbSC5Knq5VPwAOE87R69AXps3s+ukSrcs2kk7i2maxcCl2iEGk6uiV91ACsbS3kP++uz9naev1iXLc3IdvlmFa4gq6pMoDiZ
3Ra9O8PBjzQyHFWb8aWLneVu02VTFefcBcJF2iFrkruhVv0FpGJ9T2347/RcdV+wiwloayzctNYGo8op2iFrksujVu9rOafVy/6vj8imndfx2X5uP3AvvorzlWI9cFh2gXCpOBh5sxbIKrOi5o/PcaV9om3lgK5ZnUuEnRJW/a4eoVW6LXtX0tXp1quLHdi68aFMV++MZUnrNxqrkuujlUnE68Idm/fwNefetWV3jXtpU3tmzWcswqfR9okqmpu7OddGrTgAWNPqHjpb55Ye7Tl60lizZvtE/26Ta7USVzM0pkPuil0vFV4FxjfyZ+xSefuaezjPX7pCeEY38uSb1FgKnaYcYitwXHaBcKt5Gg25jPbJtyqxbOi4YWRC3QSN+nsmUM4gqL2mHGAovil41Dnh1OD/gzPZfP3hR+9W7i7BGgzKZ7LiXqHKVdoih8qbo1ZldvjnU77+649IpJ7Tfub/NR+6lYb130sCbogOUS8V7gPH1fE87y5fd3RlO/2zbowc2J5XJgOOJKvO1QwyHV0WvOh14vpYXrsnihQ91nTxnu8LL+zY5k0mv64gqE7VDDJd3RS+XiouAY4GegV63Ce+8Mbtr3CsbSWX31iQzKfQoDT5jo8W7osOHw06t8qq5beXlF6d3nbJsDVm6bQtjmXR5EziCqLJEO0gjeFl0gHKpeCH9DCi5f+HJOXd1huu1y4rNFWKZdFgGfImoMk87SKN4W/Sq44EpvV8c03bvzBs6LhxTENbTi2RS4BSiynTtEI0kzmV62rJhC8J4feDhn7Tf8NrX2+7ZT8T7P36+u5qocrx2iEbzvugAE849euS32ifPBDbSzmJU3Q4cSVQZ8EBtFlnRe0Xdu5Fsxq+tnMTouBc4lKjygXaQZrDN1F5R5THgCGCpdhTTcjOAw/NacrCif1RUuR/4AskQvsYPjwNFosoi7SDNZJvu/Ym6DwAmgQ3wmHPPAAcRVd7SDtJstkbvT1SZCnyWjE3FbOryELCfDyUHK/qqRZWHgYNpwug0Rt1dwGeIKt78v7WiDySq/Ak4gJTN52aG5SbgsLzvk6/Mij6YqPIUsBfwsHYUM2w/A47L2sCOjWAH42oVdXcB15Dc+WayZSlwMlHlGu0gWqzo9Yq6Q+ACQLSjmJrMI7lBZbZ2EE1W9KGIur8IXA9280vK3Q98OUszqjSL7aMPRVT5A/AJ+tz5ZlLnQuBzVvKErdGHI+ouAGcAPwU6lNOYxDzgO0SVe7SDpIkVvRGi7t2BW4Ex2lE8NwH4AVHlPe0gaWNFb5Soew2SNfspYNMmt9h8krX4ZO0gaWVFb7SoeyfgCsBmV20+R7IWP52oYpcrD8CK3ixR97HAxcDG2lFyairJZvpj2kGywIreTFF3N8nm/PFAp3KavHgB+CFR5XbtIFliRW+FqHsEcC7wdWz/fagWAOcDv8zzABHNYkVvpah7JPAjkstorfC1eQ34OXAlUWWhdpissqJriLq3IplA4ivAaspp0uoF4CLgRqKKDe81TFZ0TVH3BiSzdH4PGKWcJi1mAZcCvyGqrNAOkxdW9DSIugU4hGSer8/j36XJC4CbgWuJKnO0w+SRFT1tou4tgSOBfwP+hfzeJbcMuBu4EZhkm+fNZUVPs6h7C+BLJKXfh+yX/m1gMhADk30aykmbFT0rou6PkwxY+Sng08DHdQPVxAFPkxR7EjAzj7OgZIEVPaui7q2BfauPvYBt0L8o502Sg2mPVB+ziSrv6kYyYEXPj6i7Hdga2KHPY3tgc2DdBi5pOfASMHelxxNElXIjFiAiG5AMGgGwCdAD9A7LvJdzzi6YqZMV3QfJeHcbkVx33/tYn+Qe+vY+j7bqx8XAu9VHpc/nbwPzWjm4oohEwPvOuUtWel5I3r92Cq4GdnWWD5Ij2vOqj8wSka2B3wPTSc5IHC4iTzrn1q3+9y8DBzvnvi0iGwPjgS2BFcApzrmZStHV+Xa+1mTf9sAE59yuwCsDvO5y4CLn3B7
AUcC1rQiXVrZGN1kz1zlXy4iuBwPbJFv4AKwnIqs757ycQNOKbrLmH30+X8FHry3oe9+AYAfuPmSb7iazqgfiFojIaBEpkMxv3+s+4MTeL0Rkl1bnSxMrusm6M0mutrufZOy4XicCY0Vkjoj8GfiORri0sNNrxnjA1ujGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeMCKbowHrOjGeOD/AM0W+8UGUHW2AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "is_json_counts = df['is_json'].value_counts().compute()\n", - "is_json_counts.plot(kind='pie')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As the value_len increases the frequence of valid JSON on the columns 'value' also increases,\n", - "for the rows that have the value_len one std above the mean, we have the following:\n", - "- isJson True: 46691 rows\n", - " - 99,88% are valid JSON\n", - " \n", - "- isJson False: 54 rows\n", - " - 0,11% are not valid JSON\n", - "\n", - "\n", - "The valid json also represent 9.35% of the data because the number of non Json are too small to make a percentual difference. " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "one std above the mean = len: 46745 (9.35%)\n" - ] - } - ], - "source": [ - "print(\"one std above the mean = len: {0} ({1:0.2f}%)\".format(A_COUNT, A_COUNT / COUNT * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_len
meanstdminmaxcount
is_json
False82460.05555613627.1180632813510465354
True271422.740185412552.29861327669449686146691
\n", - "
" - ], - "text/plain": [ - " value_len \n", - " mean std min max count\n", - "is_json \n", - "False 82460.055556 13627.118063 28135 104653 54\n", - "True 271422.740185 412552.298613 27669 4496861 46691" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "group = std_above.groupby('is_json')\n", - "group_result = group.agg({'value_len': ['mean', 'std', 'min', 'max', 'count']}).compute()\n", - "group_result" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NOT json count: 54 (0.01%)\n", - "IS json count: 46691 (9.34%)\n" - ] - } - ], - "source": [ - "a = group_result['value_len']['count']\n", - "print(\"NOT json count: {0} ({1:0.2f}%)\".format(a[0], a[0] / COUNT * 100))\n", - "print(\"IS json count: {0} ({1:0.2f}%)\".format(a[1], a[1] / COUNT * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQIAAADuCAYAAADSvgkdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEh1JREFUeJzt3XmQnVWdxvHv6bCDvAhhSSAYdmWRyD6Cg0HQQbawODAFOkOGJYIwTlGKoIynQEUccAw4YA2jcWBAKAUxhkFBtlKjISyhhmUgxRAQIQsk8wYyMEnnnvnjvE132l7u7bv87nnf51N1qzu3b/d9oO99+rzLeY8LISAi1dZjHUBE7KkIRERFICIqAhFBRSAiqAhEBBWBiKAiEBFUBCKCikBEUBGICCoCEUFFICKoCEQEFYGIoCIQEVQEIoKKQERQEYgIKgIRQUUgIqgIRAQVgYigIhARVAQigopARFARiAgqAhFBRSAiqAhEBFjPOoB0iM82BCYWt62ADYH1gQ2Ir4MasLb42AssBxYXt2X4vGaQWjrEhRCsM0gr+GwcsCewH7Az8Q2/fXGbCIxv4qevBV6nvxgWA4uAJ4An8PnLTfxs6QIqghT5zAF7AAcUtwOBKcAmRoneoK8U4PHi40KNItKhIkiFz3YDTgCOJr75N7cNNKqVwH3AHOBufL7MOI+MQEXQreJf/YOJb/4TgA/YBmpKDZhPLIU5+HyBcR4ZREXQTXy2HvBxYBpwHLCdbaC2eYVYCjfj87nWYURF0B18Nhk4GzgTmGAbpuOeBb4P3KTNBzsqAitx6H80cAFxFFD1czpWAz8GrsPn86zDVI2KoNN8tgnwGeDvgPcbp+lW84GZwG34fK11mCpQEXRKPKHnfOASmjumXyXPApfh8zusg5SdiqDdfNYDnA5cAbzPOE2qHgUuxef3WQcpKxVBO/nsL4BvAvtaRymJB4FLtA+h9VQE7eCzA4CrgCOso5TUz4Av4vPnrYOUhYqglXy2OfBtYDrgjNOU3TvAV4FrtEOxeSqCVvHZUcTj4ZOso1TMI8B0fP60dZCUqQia5bPNgH8EZlhHqbDVwOXAVfi81zpMilQEzfDZ4cAsYCfrKALEWY9n4vMnrYOkRkUwFj7bGLgSuBDtC+g2a4B/wOfftA6SEhVBo3w2ibjX+kPWUWREPyaODlZZB0mBiqARPjsE+CnlnRVYNk8C0/D5Iusg3a7qE13q57NPAw+hEkjJvsB8fDbVOki304hgNPEU4SuBL1pHkTHrBS7C59daB+lWKoKR+Ow9wK3AsdZRpCVmAefi8zXWQbqNimA4PtuOeM29va2jSEvdDZyCz9+xDtJNVARD8dn2wAPA7tZRpC0eAI7XEYV+KoLBfLYj8YWyi3UUaau5wNH4fKV1kG6gIhgolsDDwGTjJNIZvwM+gc/ftA5iTYcP+/hsAnA/KoEq+TPgbny2qXUQayoCAJ+NB34F7GodRTruI8AcfLaRdRBLKoL41+Be4rqBUk0fBX5gHcJStYsgXlL8JjRvQOCv8NmXrUNYqXYRwGXASdYhpGtcgc8q+Xqo7lEDn00D7kTTiGVdq4DDqrY+YzWLwGd7Ew8dbWYdRbrSH4AD8fkS6yCdUr1NA59tSbyegEpAhjMJuKtYlKYSqlUEPhsH3A7sbB1Fut4hwHXWITqlWkUAlwJHWoeQZJyNz462DtEJ1dlH4LN9gMeA9a2jSFJeBfbG5yusg7RTNUYEPluPOBddJSCNmkgFNhGqUQTx6kL7W4eQZJ2Oz060DtFO5d808NlewOPABtZRJGlLiZsIy6yDtEO5RwTxKMEsVALSvG2AG6xDtEu5iwAuAg60DiGlcXJZT0Eu76aBz3YAFgKVnl4qLbcQ2LNsayyWeUTgUQlI6+0GnGMdotXKOSLw2R7A08A46yhSSkuBXfD5W9ZBWqWsI4KvoRKQ9tkG+IJ1iFYq34jAZ/sD89H0YmmvVcCu+Hy
xdZBWKOOI4EpUAtJ+mwJftQ7RKuUaEcTFLh+wjiGV0Qvshc+ftw7SrLKNCL5hHUAqZT1KsjhueUYEPjsM+LV1DKmcd4AdUz/1uEwjggusA0glbQR81jpEs8oxIoiLli4iDtVEOm0JcVSw2jrIWJVlRDADlYDY2ZbEL4uffhHEGYZ/ax1DKm+GdYBmpF8EcAwwwTqEVN7h+Oz91iHGqgxFcJZ1AJHC2dYBxirtIvDZtsAnrWOIFD5lHWCs0i4COB5NLpLuMQmf7WcdYixSL4JjrQOIDDLNOsBYpFsEPtsILVYi3UdF0GFHAJtYhxAZZB98tpN1iEalXATHWQcQGUZyo4KUi+AY6wAiwzjBOkCj0pxr4LMpwBPWMUSGsRbYFp+/YR2kXqmOCDQakG42DjjUOkQjUi2CpP4nSyUltbBOqkWQ5EkbUikqgrby2QTitE+RbnaAdYBGpFcEGg1IGrZK6XwCFYFI+ySzeaAiEGkfFUEbqQgkFcnsJ0irCHy2FbCjdQyROu1jHaBeaRUBJHspKKmkrYpZsl2vriJwzt1fz30doGsTSmq2tw5QjxEvAe6c24g41Xe8c+699C8uujkwsc3ZhrKdwXOKNGN74AXrEKMZbS2Ac4HPE9/0j9FfBCuBf25jruGoCCQ16Y8IQggzgZnOuQtCCNd1KNNIVASSmvSLoE8I4Trn3IeByQO/J4RwU5tyDUf7CCQ15SkC59zNwC7AAuJca4AAdLoINCKQ1JSnCIgnRuwZ7K9ioiKQ1FjsVG9YvecRPEV3vAm3sQ4g0qBN632gc26tc27BgNvkER472Tn3VCsCQv0jgvHAM865R4Aa8OHi/reJmwrLin8fFEJoz9LQcbFTrXgsqdmwgce+HUKY0rYkI6j3jeWHuX8q8FYI4eqBdzrnHPF6iLUmsg2mFY0kRRs0883FqOBm+kcWnwshzB30mL2AWcVz9QAnhxAWOufOAC4s7p8HnBdCWMsQ6j1q8PAwIacO+HxX4C7gN8DBwDTn3JMhhC2Kr58GHBlCOMs5ty1wA3HeQA24MITw+1FiaDQgXSsEAvG13PexBtQCbk0D5/Fv7JxbUHz+YgjhRGApcFQI4R3n3G7Aj/jTyUwzgJkhhFuccxsA45xzHwBOBQ4NIaxxzl0PnM4wO/jrPWrwZvEfCLFd1gdWAd8e9NA9gTNDCDOccyP97GuBb4UQfl803hxg71FiaETQRsULeZ0X8aAbAWrgihd432NdrXhh1AIu9H/N9X0MAULo/1gb4r5BN+j7vDbg/uJzQuj/vLbuY1yNnlDDER/fw4DH0fe1Gj3Uwrtf59376GFt/+ch4NzagV8LPcXz9bgajrX0uP7Hvnuy3Tp6GZfPqP/XMNSmwfrAd51zU4ib4bsP8X2/A77snNsBuLMYDXwM2B+YHwfobEwslSHVOyJ4z8B/O+emAQcN8dAXQgjz6/iRRwJ7FAEB3uuc2ziE8PYI39PKzQwZxDkc8cU87B+w+NsafODI4EDSkG+5rvVyk9//98ASYF/i7+adwQ8IIdzqnJtHvLr3L51zZxH/L/1bCOGSep5kTLMPQwh3EZccG2zVgM9rrPsrGzgLyxF3LE4pbtuPUgIAa8aSVcRYs6/bDHit2N/2aYYYGTvndgb+O4RwLTAb+CBwP3CKc26b4jFbOufeN9yT1LtpcNKAf/YQt1FG/FMQQqg551YU2zUvACfSf3ThV8D5wD8VP39KCGHB0D/pXSoCSVFvk99/PXCHc+5TwIOs+8e2z6nAGc65NcBi4PIQwnLn3FeAe51zPcT3z/nAS0M9SV0rHTnnZg34Zy+wCLgROI/iqEGxs/AnA7dxnHOnAt8gDo+eATYsdhZuTdxZuDuxjB4MIZw/ahCf9aJ9BZKWp/B511+gJK0lz3y2AtjCOoZIA36Nz//cOsRo6r0wyQ7OuZ8655Y655Y45+4o9lB22msGzynSjCRes/XuLJxF3AkxkTiJ4ufFfZ32R4PnFGnGq9YB6lF
vEWwdQpgVQugtbj8Etm5jruEk8T9VZIBSjQhed86d4ZwbV9zOACyWfFYRSGpKVQTTgb8kHpp4DTgFOLNdoUagIpDUJFEE9Z6/fwXw1yGEFRBPTgCuJhZEJ6kIJDVJFEG9I4IP9pUAQAhhOfCh9kQakYpAUpPEa7beIugpLmcOvDsisJgN+AeD5xQZqzfx+YrRH2av3iK4BpjrnLvCOXc5MBf4VvtiDcPnrwDLO/68ImPzhHWAetVVBMXVik8mzoJaBpwUQri5ncFG8LjR84o06jHrAPWqe3gfQniGOF/A2qPEacwi3S6ZP1qpLYIKsQhEUpDMiEBFINIebwHPWYeoV3pF4POXgNetY4iMYgE+T+aqWukVQZTMkEsqK5n9A5BuEdRzXUQRS0ltwqZaBA9ZBxAZQQDutQ7RiFSL4GHgf6xDiAzjEXy+xDpEI9IsAp/3AvdYxxAZxmzrAI1Kswiin1kHEBnGz60DNCrlIrgHXeJcus8ifP6f1iEalW4R+Hwl2mko3Se50QCkXARRcttiUnpJviZVBCKts5J4RCs5aReBz18mXhtBpBvchs+T3G+VdhFEN1oHECkk+1osQxHcjk4uEnsL8HlSpxUPlH4R+Pxt4BbrGFJ5yY4GoAxFEF1vHUAqbSVgdem+lihHEfj8GeA+6xhSWbPw+ZvWIZpRjiKIvmMdQCqpBlxnHaJZZSqCe0jo0lBSGnPw+QvWIZpVniLweQC+bh1DKiUA3jpEK5SnCKJbgOQmfEiybsfnySxiMpJyFUG8WOQl1jGkEtYAX7EO0SrlKgIAn99Noud7S1L+tQz7BvqUrwiii60DSKmtAi63DtFK5SwCn88D7rSOIaU1E58vtg7RSuUsguhSoNc6hJTOcixWAm+z8haBz58D/sU6hpTOZfg8tw7RauUtguhLwMvWIaQ0HgJusA7RDuUugnj+93TiiR8izVgFTC9OXCudchcBgM/vB75nHUOSdzE+f9E6RLuUvwiiLwCl/SVK2z1Eyae6V6MIfL4KOBNtIkjjSr1J0KcaRQDg84cpwXRR6bhSbxL0qU4RRJcA/2UdQpLxC0q+SdDHhVDqEc+f8tnuwDxgC+so0tWeBw7G55W4MG7VRgTg8+eBU4G11lGka+XA8VUpAahiEQD4/F7gIusY0pVqwGnFmamVUc0iAPD5TOD71jGk61yMz39hHaLTqlsE0XnAb6xDSNe4GZ9fbR3CQrWLwOergZOAl6yjiLl5wNnWIaxUuwgAfL4M+CSwzDqKmHkGOBaf/591ECsqAuhbIOUoYIV1FOm4hcDH8Pnr1kEsqQj6+PxJ4OPEQ0dSDS8CR5TtakNjoSIYKK5m+wm0unIVvABMxeevWAfpBiqCweL1Do8AKj1ULLnngMPxuXYSF1QEQ4mLVnwUqPyQsYSeJpbAH62DdBMVwXB8/jRwGPCsdRRpmfuAj+DzJdZBuo2KYCRxAYtDgDnWUaRpM4Gj8bmODA2herMPx8JnPcDX0HJqKVoNnIfPdTr5CFQEjfDZacAPgI2to0hdlgIn4fPfWgfpdiqCRvlsP+AuYJJ1FBnRAuAEfK7L2ddB+wga5fPHgQOAB62jyLB+CByqEqifRgRj5TNHnL14FbCpcRqJXgPOwefaudsgFUGzfLYT8boGU62jVNy/AxfqqMDYqAhaIY4OPkscHWxmnKZqFgPn4vPZ1kFSpiJoJZ9NJo4OjjBOUhW3Ahfg8+XWQVKnImi1ODr4G+ByYAfbMKX1JPClKl5SrF1UBO3is42A84FLgS2N05TFi8BlwK1lX3mo01QE7eazjLj24ufR0YWxWkY8s/N7xeXlpMVUBJ3is22Jf83OAdY3TpOKt4BrgGuKJe6lTVQEneaznYlrKnwGHWEYzhLiUvbX4/Ol1mGqQEVgJW4yTAc+B+xsnKZbPEGcJfgjbQJ0lorAWpzZeBTxUtrHU73NhreA24Ab8fkj1mGqSkXQTXy2DXGT4RTgIMDZBmqb1cS5Gj8Bbtf
2vz0VQbeKOxePAY4jjhhSP+LwBvAfwGzgl3rzdxcVQQriOQlTiaVwLOlMgV5IfOPPBn6Lz7UCdZdSEaQoTnTaH9hvwMfxppngFeCx4vYo8Jj2+KdDRVAWPptEfynsDUwEJgDbARu06Fn+F3iVON33VeIVgeMbX2/6pKkIyi7OfdiSWAgT6C+HTYH1gHHFR4BeYE1xe5s4s+9V+t78PtcqUCWlIhARXapMRFQEIoKKQERQEYgIKgIRQUUgIqgIRAQVgYigIhARVAQigopARFARiAgqAhFBRSAiqAhEBBWBiKAiEBFUBCKCikBEUBGICCoCEUFFICKoCEQEFYGIoCIQEVQEIoKKQERQEYgIKgIRQUUgIqgIRAT4fxoxDoxWW+6XAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "group_result['value_len']['count'].plot(kind='pie')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### All greater values are JSON\n", - "\n", - "There is absolute no value greater than 104653 (max value for non-json) that represents a valid JSON. \n", - "\n", - "This implies that all the greater values are JSON but they represent very low percentage of the whole data (6.76%). " - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "104653\n", - "len: 33788 (6.76%)\n" - ] - } - ], - "source": [ - "max_non_json_value = group_result['value_len']['max'][0]\n", - "allJson = df[df['value_len'] > max_non_json_value ]\n", - "length = len(allJson)\n", - "print(\"len: {0} ({1:0.2f}%)\".format(length, length / COUNT * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb new file mode 100644 index 0000000..ba2df56 --- /dev/null +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb @@ -0,0 +1,664 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.diagnostics import ProgressBar\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# from dask.distributed import Client\n", + "# #Initializing client\n", + "# client = Client()\n", + "# client\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parquet\n", + "Used sample: sample_0_prep/full_sample_json.parquet\n", + " * This sample is the 10% sample with the \"is_json\" column added to it, this column is the result of the 'value' columns as a valid json or not. \n", + " * This sample can be obtained by running 'jsJson_dataPrep.ipynb'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_lenis_json
08False
\n", + "
" + ], + "text/plain": [ + " value_len is_json\n", + "0 8 False" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet('sample_0_prep/full_sample_json.parquet', engine=\"pyarrow\", columns=['value_len', 'is_json'])\n", + "df.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 1.6s\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " df = df.compute()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Values distribution: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The absolute majority of the values are small. As seen on the graph below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "df['value_len'].plot(kind='hist', legend=True, logy=True, bins=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Type distribution\n", + "The non-json values are found mainly withing the smaller values.\n", + " - Orange bar: non-json values\n", + " - Blue bars: json values" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'density')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAELCAYAAADz6wBxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFqxJREFUeJzt3X+QXWd93/H3JzLCDSEOGEFdy4rkyHHipknBWwWawpC0JnZi4YF4GimmUFu1BoLbZtJMRm46BaaTUSBTEsCe2E4xxkywcR2nkYyohjGkzkw9INkmII0irDim3tiJDEmME2aCRb794x7ZV8v+uLvnnr2rs+/XzM7e8+w95/neY60/+zznueemqpAkaam+Y9IFSJJObwaJJKkVg0SS1IpBIklqxSCRJLVikEiSWjFIJEmtGCSSpFZOyyBJ8vokf5jkpiSvn3Q9krSanbHcHSa5FbgcOF5VPzTUfinwAWAN8D+q6tfmOUwBfwOcCUwv1OfLXvay2rhxY5uyJWnVefDBB79aVesWel6W+xYpSV7HIARuPxkkSdYAXwYuYRAMB4DtDEJl94xDXAN8tar+PskrgPdX1VXz9Tk1NVUHDx4c7wuRpJ5L8mBVTS30vGUfkVTV/Uk2zmjeAhyrqkcBktwJXFFVuxmMXubyV8ALZ/tBkp3AToANGza0rFqSNJeVco3kXODxoe3ppm1WSd6c5GbgY8ANsz2nqm6pqqmqmlq3bsGRmSRpiZZ9RDKHzNI255xbVd0D3LPgQZOtwNbNmze3KE2SNJ+VMiKZBs4b2l4PPDGhWiRJi7BSguQAcEGSTUnWAtuAPW0PWlV7q2rnWWed1bpASdLslj1IktwBPABcmGQ6yY6qOgFcB+wHjgB3VdXhMfS1NcktTz/9dNtDSZLmsOzLfyfB5b+StHijLv9dKVNbnXBEIkndWymrtjpRVXuBvVNTU9e2Oc7GXZ88ZfuxM3/u25/0bsNK0urU6xGJJKl7vQ4Sp7YkqXu9DhKX/0pS93odJJKk7vU6SJzakqTu9TpInNqSpO71OkgkSd0zSCRJrfQ6SLxGIknd63WQeI1EkrrX6yCRJHXPIJEktWKQSJJa6XWQeLFdkrrX6yDxYrskda/XQSJJ6p5BIklqxSCRJLVikEiSWjktP7M9yXcA/w34buBgVX10wiVJ0qq17COSJLcmOZ7k0Iz2S5McTXIsya4FDnMFcC7wLDDdVa2SpIVNYkRyG3ADcPvJhiRrgBuBSx
gEw4Eke4A1wO4Z+18DXAg8UFU3J7kbuG8Z6pYkzWLZg6Sq7k+ycUbzFuBYVT0KkORO4Iqq2g1cPvMYSaaBbzab3+quWknSQlbKxfZzgceHtqebtrncA/xkkg8B98/2hCQ7kxxMcvCpp54aX6WSpFOslIvtmaWt5npyVX0D2DHfAavqliRPAlvXrl17ccv6JElzWCkjkmngvKHt9cATbQ/qLVIkqXsrJUgOABck2ZRkLbAN2NP2oN60UZK6N4nlv3cADwAXJplOsqOqTgDXAfuBI8BdVXW4bV+OSCSpe5NYtbV9jvZ9wL5x9pVkK7B18+bN4zysJGnISpna6oQjEknqXq+DxGskktS9XgeJIxJJ6l6vg8QRiSR1r9dB4ohEkrrX6yCRJHWv10Hi1JYkda/XQeLUliR1r9dBIknqnkEiSWql10HiNRJJ6l6vg8RrJJLUvV4HiSSpewaJJKkVg0SS1IpBIklqpddB4qotSeper4PEVVuS1L1eB4kkqXsGiSSpFYNEktSKQSJJauWMSRewFEleC1zFoP6LquqfT7gkSVq1ln1EkuTWJMeTHJrRfmmSo0mOJdk13zGq6g+r6u3AvcBHu6xXkjS/SYxIbgNuAG4/2ZBkDXAjcAkwDRxIsgdYA+yesf81VXW8efxzwL/rumBJ0tyWPUiq6v4kG2c0bwGOVdWjAEnuBK6oqt3A5bMdJ8kG4Omq+vocP98J7ATYsGHDeIqXJH2blXKx/Vzg8aHt6aZtPjuAj8z1w6q6BXgP8NDatWtbFyhJmt1KCZLM0lbz7VBV76qq/7vAc3xnuyR1bKUEyTRw3tD2euCJtgf1XluS1L2VEiQHgAuSbEqyFtgG7Gl7UEckktS9SSz/vQN4ALgwyXSSHVV1ArgO2A8cAe6qqsNj6MsRiSR1bBKrtrbP0b4P2DfmvvYCe6empq4d53ElSc9bKVNbnXBEIknd63WQeI1EkrrX6yBxRCJJ3et1kDgikaTu9TpIJEndM0gkSa30Oki8RiJJ3TstP9hqVJN6H8nGXZ/8trbHfu2nl7MESVo2vR6RSJK61+sgcWpLkrrX6yBx+a8kda/XQSJJ6p5BIklqxSCRJLVikEiSWul1kLhqS5K61+sgcdWWJHWv1+9sP134TnhJp7Nej0gkSd0zSCRJrRgkkqRWRgqSJJcnWTGhk2RDkj1Jbk2ya9L1SNJqNmo4bAMeSfK+JD/YpsPmf/7Hkxya0X5pkqNJjo0QDt8PfLKqrgEualOPJKmdkYKkqt4CvBL4E+AjSR5IsjPJi5fQ523ApcMNSdYANwKXMQiG7UkuSvJPktw74+vlwMPAtiSfAT67hBokSWMy8nRVVX0d+F3gTuAc4E3AQ0n+/WI6rKr7gb+c0bwFOFZVj1bVN5s+rqiqL1XV5TO+jgNXA++qqp8AZl0n2wTdwSQHn3rqqcWUKElahFGvkbwxye8BnwFeAGypqsuAHwF+aQx1nAs8PrQ93bTN5X8D/yHJTcBjsz2hqm6pqqmqmlq3bt0YSpQkzWbUNyReCfxGM5p4TlV9I8k1Y6gjs7TVXE+uqkNNTfMfNNkKbN28eXOL0iRJ8xl1auvJmSGS5L0AVXXfGOqYBs4b2l4PPDGG40qSOjZqkFwyS9tlY6zjAHBBkk1J1jJYJban7UG915YkdW/eIEnyjiRfAn4gyReHvv4U+OJSOkxyB/AAcGGS6SQ7quoEcB2wHzgC3FVVh5dy/Bl9efdfSerYQtdIPg58CtgNDL+345mqmrnyaiRVtX2O9n3AvqUcc56+9gJ7p6amrh3ncSVJz1toaquq6jHgncAzQ18keWm3pbXniESSurdQkHy8+f4gcLD5/uDQ9ormNRJJ6t68U1tVdXnzfdPylCNJOt2M+obEH0vyoubxW5K8P8mGbktrz6ktSereqMt/fwv4RpIfAX4Z+Arwsc6qGhOntiSpe6MGyYmqKuAK4ANV9QFgKTdslCT1zKi3SHkmyfXAW4DXNXfrfU
F3ZY2Ht0h5np8LL6kro45Ifhb4O2BHVf05gxsq/npnVY2JU1uS1L2RRiRNeLx/aPv/Abd3VZQk6fQx6qqtNyd5JMnTSb6e5JkkX++6OEnSyjfqNZL3AVur6kiXxYxbX6+ReL1D0koy6jWSvzjdQgS8RiJJy2HUEcnBJJ8A/heDi+4AVNU9nVQlSTptjBok3w18A3jDUFsBBokkrXKjrtq6uutCJEmnp1FXbX1/kvuSHGq2fzjJf+m2tPa815YkdW/Ui+2/DVwPPAtQVV9k8HG4K5oX2yWpe6MGyXdW1edntJ0YdzGSpNPPqEHy1STfx+ACO0muBJ7srCpJ0mlj1FVb7wRuAX4gyZ8Bfwpc1VlVmjjf9ChpVPMGSZJfHNrcB3yWwSjmb4GfYej+W5Kk1WmhEcnJzxy5EPhnwO8DAf4NcH+Hdc0ryUXAu4GvAfdV1d2TqkWSVrt5r5FU1Xuq6j3Ay4BXVdUvVdV/Ai4G1i+lwyS3Jjl+cinxUPulSY4mOZZk1wKHuQz4UFW9A3jrUuqQJI3HqNdINgDfHNr+JrBxiX3eBtzA0G3omw/KuhG4BJgGDiTZA6wBds/Y/xoGH/P7riRvBM5eYh2SpDEYNUg+Bnw+ye8xWLn1JuCjS+mwqu5PsnFG8xbgWFU9CpDkTuCKqtoNXD7Hod7ZBJC3aZGkCRr1Fim/muRTwGubpqur6uEx1nEu8PjQ9jTwo3M9uQmi/wy8iDk+qTHJTmAnwIYNG8ZUpiRpplFHJFTVQ8BDHdWR2bqcp5bHaEJinufckuRJYOvatWsvbleexs3lxVJ/jBwkHZsGzhvaXg880fagVbUX2Ds1NXVt22NpfgaDtHqN+s72rh0ALkiyKclaBvfx2tP2oN60UZK6t+xBkuQO4AHgwiTTSXZU1QngOmA/cAS4q6oOt+3LmzZKUveWfWqrqrbP0b6Pwbvnx6avn9kuSSvJSpna6oQjEknq3kq52N4JRyT94gV9aWVyRCJJaqXXQeKqLUnqXq+DxBGJJHWv10EiSeper4PEqS1J6l6vg8SpLUnqXq+DRJLUPYNEktRKr4PEaySS1L1eB4nXSCSpe70OEklS9wwSSVIrBokkqRWDRJLUSq+DxFVbktS9Xn8eSVXtBfZOTU1dO+latPL5eScrg/8dTj+9HpFIkrpnkEiSWjFIJEmtGCSSpFZWfJAkOT/Jh5PcPdT2oiQfTfLbSa6aZH2StNp1GiRJbk1yPMmhGe2XJjma5FiSXfMdo6oeraodM5rfDNxdVdcCbxxz2ZKkReh6+e9twA3A7ScbkqwBbgQuAaaBA0n2AGuA3TP2v6aqjs9y3PXAl5rH3xpzzZKkReg0SKrq/iQbZzRvAY5V1aMASe4Erqiq3cDlIx56mkGYfIE5RlVJdgI7ATZs2LDo2iVJo5nENZJzgceHtqebtlklOTvJTcArk1zfNN8D/EyS3wL2zrZfVd1SVVNVNbVu3boxlS5JmmkS72zPLG0115Or6mvA22e0/S1w9YIdJVuBrZs3b15sjZKkEU0iSKaB84a21wNPTKAOST3hbVUmaxJTWweAC5JsSrIW2Abs6aIjPyFRkrrX9fLfO4AHgAuTTCfZUVUngOuA/cAR4K6qOtxR/979V5I61vWqre1ztO8D9nXZd9OPd/+VpI6t+He2t+GIRJK65+eRqLe8ACstj16PSCRJ3ev1iMT3kUgaB0e38+v1iMTlv5LUvV4HiSSpe70OEldtSVL3eh0kTm1JUvd6HSSSpO71Okic2pKk7vU6SJzakqTu9fp9JFLXZr6/wPcWaDUySKQVzDfC6XTQ66ktSVL3DBJJUiu9DhJXbUlS93odJK7akqTu9TpIJEndM0gkSa0YJJKkVgwSSVIrKz5Ikpyf5MNJ7p6vTZI0GZ0GSZJbkxxPcmhG+6VJjiY5lmTXfMeoqkerasdCbZKkyej6Fim3ATcAt59sSLIGuBG4BJgGDiTZA6wBds/Y/5qqOt5xjZKkFjoNkq
q6P8nGGc1bgGNV9ShAkjuBK6pqN3D5uPpOshPYCbBhw4ZxHVaSNMMkrpGcCzw+tD3dtM0qydlJbgJemeT6udpmqqpbqmqqqqbWrVs3xvIlScMmcfffzNJWcz25qr4GvH2htlk7SrYCWzdv3rzYGiVJI5pEkEwD5w1trweemEAdkvBW9WpvElNbB4ALkmxKshbYBuzpoiPvtSVJ3et6+e8dwAPAhUmmk+yoqhPAdcB+4AhwV1Ud7qh/7/4rSR3retXW9jna9wH7uuy76WcvsHdqaurarvuSpNVqxb+zvQ1HJJLUvV4HiddIJKl7k1i1Ja1arpDSXE7nfxu9HpE4tSVJ3et1kDi1JUnd63WQSJK61+sgcWpLkrrX6yBxakuSutfrIJEkdc8gkSS10usg8RqJJHWv129I9F5bkvpqJb2BsdcjEklS9wwSSVIrBokkqZVeB4kX2yWpe70OEt+QKEnd63WQSJK6Z5BIkloxSCRJrRgkkqRWUlWTrqFzSZ4CvrKEXV8GfHXM5ZzuPCen8nycyvNxqtP9fHxvVa1b6EmrIkiWKsnBqpqadB0riefkVJ6PU3k+TrVazodTW5KkVgwSSVIrBsn8bpl0ASuQ5+RUno9TeT5OtSrOh9dIJEmtOCKRJLVikEiSWjFI5pDk0iRHkxxLsmvS9SxFkluTHE9yaKjtpUk+neSR5vtLmvYk+WDzer+Y5FVD+7ytef4jSd421H5xki81+3wwSZbax3JIcl6SzyY5kuRwkv+4ms9JkjOTfD7JHzXn4z1N+6Ykn2tq/USStU37C5vtY83PNw4d6/qm/WiSnxxqn/X3aCl9LJcka5I8nOTepdbap/Mxkqrya8YXsAb4E+B8YC3wR8BFk65rCa/jdcCrgENDbe8DdjWPdwHvbR7/FPApIMCrgc817S8FHm2+v6R5/JLmZ58HXtPs8yngsqX0sYzn4xzgVc3jFwNfBi5areek6fO7mscvAD7X1HAXsK1pvwl4R/P454GbmsfbgE80jy9qfkdeCGxqfnfWzPd7tNg+lvnfyS8CHwfuXUqtfTsfI52zSRewEr+a/xHsH9q+Hrh+0nUt8bVs5NQgOQqc0zw+BzjaPL4Z2D7zecB24Oah9pubtnOAPx5qf+55i+1jgufm94FLPCcF8J3AQ8CPMngn9hlN+3O/C8B+4DXN4zOa52Xm78fJ5831e9Tss6g+lvE8rAfuA34CuHcptfbpfIz65dTW7M4FHh/anm7a+uAVVfUkQPP95U37XK95vvbpWdqX0seya6YIXsngr/BVe06aaZwvAMeBTzP4i/mvq+rELPU8V2vz86eBs1n8eTp7CX0sl98Efhn4+2Z7KbX26XyMxCCZXWZp6/s66ble82Lbl9LHskryXcDvAr9QVV+f76mztPXqnFTVt6rqnzL4S3wL8IPz1DOu8zHfa57Y+UhyOXC8qh4cbp6nnl6fj8UwSGY3DZw3tL0eeGJCtYzbXyQ5B6D5frxpn+s1z9e+fpb2pfSxbJK8gEGI/E5V3dM0r+pzAlBVfw38AYNrJN+T5IxZ6nmu1ubnZwF/yeLP01eX0Mdy+DHgjUkeA+5kML31m0uotS/nY2QGyewOABc0KynWMrjItWfCNY3LHuDkKqO3MbhOcLL9rc0qolcDTzdTMPuBNyR5SbPS6A0M5m+fBJ5J8upmZdJbZxxrMX0si6bODwNHqur9Qz9aleckybok39M8/gfAvwKOAJ8Frpyj1pOv4UrgMzWYvN8DbGtWGG0CLmCw6GDW36Nmn8X20bmqur6q1lfVxqbWz1TVVUuotRfnY1EmfZFmpX4xWE3zZQZzxr8y6XqW+BruAJ4EnmXwl80OBvOr9wGPNN9f2jw3wI3N6/0SMDV0nGuAY83X1UPtU8ChZp8beP5OCYvuY5nOx79gMC3wReALzddPrdZzAvww8HBzPg4B/7VpP5/B//iOAf8TeGHTfmazfaz5+flDx/qV5jUcpVmpNt/v0VL6WOZ/K6
/n+VVbq/58LPTlLVIkSa04tSVJasUgkSS1YpBIkloxSCRJrRgkkqRWDBJJUisGiTRmSf5mzMe7LcmVCz9TmgyDRJLUikEiLSDJe5P8/ND2u5O8K8l9SR7K4IOsrphlv9ef/HCkZvuGJP+2eXxxkv+T5MEk+0/eh2uEWmbdL8kfNHV+PsmXk7y29QuXRmSQSAu7E/jZoe1/DXwEeFNVvQr4ceC/N/fXWlBz48gPAVdW1cXArcCvjmG/M6pqC/ALwLtGqUUahzMWfoq0ulXVw0lenuQfAeuAv2JwD7PfSPI6Bp9dcS7wCuDPRzjkhcAPAZ9usmdNc7y2+528m/GDDD7QTFoWBok0mrsZ3H31HzIYoVzFIFQurqpnm1uPnzljnxOcOuo/+fMAh6vqNYusYaH9/q75/i383dYycmpLGs2dDG77fSWDUDmLwYcgPZvkx4HvnWWfrwAXNbcTPwv4l037UWBdktfAYMoqyT8eoYal7id1yr9apBFU1eEkLwb+rKqeTPI7wN4kBxncjv6PZ9nn8SR3MbhN+yMMbtlOVX2zWc77wSZgzmDwAUqHF6hhSftJXfM28pKkVpzakiS14tSWtEIkuZHB54YP+0BVfWQS9UijcmpLktSKU1uSpFYMEklSKwaJJKkVg0SS1Mr/B0NHHeyvZyHKAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(\n", + " (df[df.is_json==True].value_len, df[df.is_json==False].value_len),\n", + " bins=25,\n", + " density=True,\n", + " label=['true', 'false'],\n", + "# color=['teal','orange'],\n", + ")\n", + "plt.yscale('log')\n", + "plt.xlabel('value_len')\n", + "plt.ylabel('density')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'density')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(\n", + " (df[df.is_json==True].value_len, df[df.is_json==False].value_len),\n", + " bins=25,\n", + " density=True,\n", + " label=['true', 'false'],\n", + "# color=['teal','orange'],\n", + ")\n", + "plt.xlabel('value_len')\n", + "plt.ylabel('density')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JSON percentual by group\n", + "Here the orange is the percentual of non-JSON values found in each group, and the blue is the percentual of JSON values. \n", + "We can see that as we filter the data to bigger values the percentual of JSON values also increases. \n", + "\n", + "The gorups are: \n", + "- Original: all original data (sample 10%)\n", + "- Above_mean: original data filtered to only values above the mean\n", + "- Above_std: original data filtered to only values 1 std above the mean" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def count_json(df):\n", + " trues = df.is_json[df.is_json == True].count()\n", + " falses = df.is_json[df.is_json == False].count()\n", + " total = df.is_json.count()\n", + " return trues/total, falses/total\n", + " \n", + "total_count = count_json(df)\n", + "total_mean = df.value_len.mean()\n", + "total_std = df.value_len.std()\n", + "\n", + "above_mean_count = count_json(df[df['value_len'] > total_mean])\n", + "above_std_count = count_json(df[df['value_len'] > (total_mean + total_std)])\n", + "\n", + "p1 = pd.DataFrame([total_count, above_mean_count, above_std_count],\n", + " columns= [ 'json', 'other'],\n", + " index=[ 'original', 'above_mean', 'above_std'])\n", + "plot = p1.plot(kind='bar')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# JSON percentual by bins" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#Helper code to separate and calculate what needed\n", + "import math\n", + "def percetangeData(df):\n", + " bins=[]\n", + " trues=[]\n", + " falses=[]\n", + " \n", + " nbins = 10\n", + " minimum_value = min(df.value_len)\n", + " range_value = max(df.value_len) - minimum_value\n", + " step = math.ceil(range_value/nbins)\n", + " bin_max_range = minimum_value\n", + " def count_in_range(df):\n", + " f1 = df.value_len >= bin_max_range - step\n", + " f2 = df.value_len < bin_max_range\n", + " return len(df[f1 & f2])\n", + "\n", + " for x in range(0, nbins):\n", + " bin_max_range += step\n", + " range_count = count_in_range(df)\n", + " bins.append(str(bin_max_range)) #superior margin for each bin\n", + " if range_count == 0:\n", + " #If range_count is 0 then there is no counting to do for trues or falses, all 0. 
\n", + " trues.append(0)\n", + " falses.append(0)\n", + " else:\n", + " trues.append(count_in_range(df[df.is_json == True]) / range_count)\n", + " falses.append(count_in_range(df[df.is_json == False]) / range_count)\n", + "\n", + "# print('Bins:', bins)\n", + "# print('Trues: ', trues)\n", + "# print('Falses: ', falses)\n", + "# print(pd.DataFrame([bins, trues, falses], index= ['up to value', 'json%', 'non json%']))\n", + " return (bins, trues, falses)\n", + "\n", + "def plotPercentualComparison(df, title='Value type: Json X Other'):\n", + " bins, trues, falses = percetangeData(df)\n", + " width = 0.95\n", + " p1 = plt.bar(bins, trues, width=width)\n", + " p2 = plt.bar(bins, falses, bottom=trues, width=width)\n", + "\n", + " plt.ylabel('Scores')\n", + " plt.xlabel('Value_len')\n", + " plt.title(title)\n", + " plt.legend((p1[0], p2[0]), ('JSON', 'Other'))\n", + " idx = np.round(np.linspace(0, 10 - 1, 4)).astype(int)\n", + " plt.xticks(idx, [bins[i] for i in idx])\n", + "\n", + " return plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## All values\n", + "If all data is divided in 10 bins and the percentual of NON-JSON values in each bin is painted orange, we have the following graph:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEXCAYAAABCjVgAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xm8HfP9x/HXOzebLYKEkhtJLEGKWmKrfY8tqkWjlvjZWhWKWqKLraV2tVdaFLWloW2QSpUS1VJRESJCRHAFCYJGhCT38/tj5o7JyV2O68w9kbyfj8d5ZJbvzHzPnJvzPvOdme8oIjAzMwNoV+0KmJnZosOhYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCWcEkTZC0Q7XrUQmSDpf0z2rXw4rjULCySXpE0kxJnUqm/17SL5tYJiSt1cJ6D5I0VZJKpreXNF3S3rlpfSTVS7q23G1JOlvSH5orn763OZJm5V735sr+RNKr6fQ6SXc1957yIuLrEfFIueUXFZJ6p/uofbXrYm3HoWBlkdQb2BYIYGCFV/8noCuwfcn0Aen2HshNOwyYCQwqDacKGBIRy+Ze+wBIGgwcCuwSEcsC/YGHKrztJY7DZtHkULByHQY8AfweGFzJFUfEHGB4uo3Sbd4WEfNKpv0MmAvsU8l6NGMzYHREvAIQEW9HxLByF06PgnZJhzeXNFbSR5LekXRZrtzAtKnpg/TIZb2SdZwiabykDyXdJalzE9u7TtKI3PiFkh4qPRJL57WT9DNJr6VHZbdIWj6dPSb994P0CGmr3HKXpEeNr0raIzd9eUk3SHpL0puSfimpJp13uKTHJV0u6X3g7HL3obUdh4KV6zDgtvS1u6RVKrz+m4H9JS0FyZcLyZf+LQ0FJG0L1AJ30niIFOUJ4DBJp0rq3/All6vXUEn3lbmuK4ArIqILsCbJ+0BSX+AO4ESgOzAKuFdSx9yyB5IcPfUBNgQOb2IbPwY2TL+EtwWOBAZH433aHJ6+dgTWAJYFrk7nbZf+2zU9cvp3Or4FMAnoBlwE3JALnJuBecBawMbAbsBRue1tAUwBVgbOa6L+VkUOBWuRpG2AXsDwiHgaeAX4XiW3ERGPA+8A+6WTDgReiohxuWKDgb9GxEzgdmAPSStXsBpXpr/SG16/SOv2B+B4YHfgUWC6pKG5ul8QEXs3vsqFzAXWktQtImZFxBPp9O8C90fEgxExF7gEWAr4Zr5+ETEtIt4H7gU2amwDETEbOAS4DPgDcHxE1DVRn4OByyJiSkTMAs4gaZprrmnntYj4bUTMJwmBVYFV0h8KewAnRsTHETEduBwYlFt2WkRcFRHzIuKTZrZhVeJQsHIMBv4WEe+m47dT4Sak1C18/uv/UJIvHADSI4gDSI5USH+1vk554TQP6JCfIKlhfG5u8gkR0TX3+nnDjIi4LSJ2ITn38QPgXEm7f5E3lzoS6Au8KOmp3En01YDXcturB94AeuSWfTs3PJvkV32jIuI/JL/IRXo00oQFtpsOtweaOxLM6pEGEGldepHs57caghW4nuSooMEbzazXFgEOBWtW+mV8ILC9pLclvQ2cBHxD0jcqvLlbgJ3TtustScKnwX5AF+DaXD16UF4T0utA75JpfYD5wJtfpIIRMTci/giMB9b/Isumy78cEQeRfFFeCIyQtAwwjeRLFYC0OabnF61fbvnjgE7pek9rpugC2wVWJwnRd0hO8n8RbwCfAt1ywdolIr6eK+NumRdxDgVrybdIvjz7kTRXbASsBzzGgl/INZI65175tvCOJfMWaJNvEBGvAf8kaVt/MCLyv4wHAzcCG+TqsTWwkaQNWtjWA8A6kg6V1EHSisD5wIiSk9iNStvm95K0XHpidg/g68CTLS3byLoOkdQ9PRL4IJ08n+TX/F6Sdk6PYn5M8gX7r1Zsoy/wS5ImpEOB0yQ12tREsq9PUnKp77Ik++WudL/MAOpJzjW0KCL
eAv4GXCqpS7qv1pRUelWZLcIcCtaSwcBNEfF6etXN2+mX9dXAwbm256HAJ7nXw7l1TCiZ93/NbO9mkl+u+RPMPYCdgV/n65Ce33iABZuyFtpW2ra9J/B9YDrwPPAhcGzJtq/WgvcpPJ1O/wj4CckRxwckJ1ePjYh/pvX7iaS/NvOe8gYAEyTNIjnpPCgi5kTEJJIv8auAd0lOsu8TEZ+VuV7SurQnOY9wYUQ8GxEvp3W/VY1fwnsjcCvJlUavAnNIzp80NA2dBzyeNgdtWUYVDgM6Ai+QXDo8guScg31FyA/ZMSuWpNeBQyJiTIuFzarMRwpmBZLUneQS06lVropZWQoLBUk3pjfDPN/EfEm6UtJkJTfkbFJUXcyqQdJmwMvAVRHxerXrY1aOwpqPJG0HzAJuiYiFrtKQtCdJ2+WeJDe0XBERWxRSGTMzK0thRwpp++n7zRTZlyQwIr2Bp6skn5AyM6uianZI1YMFb2SpS6e9VVpQ0jHAMQDLLLPMpuuuu27rtjjtmdYt92WstjEAz735YZtveoMey1dlu9Xc9gY9km57lqT97fe85G27NZ5++ul3I6J7S+WqGQoLdc5FEze2pJ2PDQPo379/jB07tnVbPLv1O7TVzk7q2nvo/W2+6bEX7FWV7VZz22Mv2AtYsva33/OSt+3WkPRay6Wqe/VRHckdmw1qSe6uNDOzKqlmKIwk6XlS6U0xH6Z3RJqZWZUU1nwk6Q5gB6CbpDrgLNJOySLiNyRdA+8JTCbp3Ku5u1zNzKwNFBYKaadfzc0P4Liitm9mS64undpx/BYr0KtrB9To6cvWmzhxIgC/Hdj2F0s2bLs5nTt3pra2lg4dOrRYtjF+HJ6ZLXaO32IFNllzNdovvRxa+IFzX8p6tV0BmFv3QQslK69h202JCN577z3q6uro06dPq7bhbi7MbLHTq2uHQgJhUSeJlVZaiTlz5rR6HQ4FM1vsCC1xgdDgy75vh4KZmWV8TsHMFnsDr368ouubWsZNZFuuU8u/Jr7ORWefwX8eH4MkOnXqzEXX3UTt6r3430cfcsGZpzPuqeRZTRtttgVDz72Q5bosz5tvvM6e3/wGp597Id/7v2MAOP9npzJgh605/PDDK/peSvlIwcysIKNH3sOMd95mxIOPc/ff/8Xlv7uVLl2SnhXOPvUEalfvzf2PP8P9jz9Dj569OOe0H2XLrtitO7ff8BvmfvaFnrP0pTkUzMwKMmP6O3RbeRXatUu+aldZtQddunbl9Ven8MJz4zjmR6dmZb9/4mlMGP8Mb0x9FYAVVlyJzbfZjpEj7mjTOjsUzMwKsvs+32LM3x/gwN235ZJzf8bE58cDMOXlF1mn3wbU1Hz+uPKamhrW6bcBr7z0+b0IR/zwJG4Zdg3z589vszo7FMzMCrLKqj34yyNPccLQM2nXThwzaF+e/OejRDRxlVAyIxutXb0X62+0CaP+/Mc2q7NPNJuZFahjp05ss+OubLPjrqzUbWUeHn0/Bx/xA16cMJ76+vqsaam+vp5JE59njbXWWWD5o4aczI+/P5hNt/hmm9TXRwpmZgWZ+NyzTH876eezvr6el16cwGo9erJ6nzVY9+sbMuzKS7Kyw668hPXW/war91ljgXX0Wasva/ZdlzEPjW6TOvtIwcwWeyOHbF2xdW3YQlcTAPPmzaNjx468/+4Mzjn9R3z26acArL/Rpgw6/GgAzrn4Kn515mnsvc0mRAQbbroZZ198VaPrO+r4k/nugO0r9h6a41AwM6uwV16aSG2vPmy94y5sveMujZbp0rUrv7pyWKPzevRcnXse+nc2vk6/DRj3+vtlBdKX5VAwM6ug4bfeyB03DePUs86vdlVaxaFgZlZBBx56BAceekS1q9FqPtFsZmYZh4KZmWUcCmZmlnEomJlZxieazWyxt+HvelV2hWd/2GKRd956k/N/eipTXp5EfX092+2yOyf/9FxeeXkSM955i2132g2A6y67gKWXXobBPzi
+snVsJR8pmJlVWERw0tGHsePue3HvY08zcsxYZn/8MVdd9AsmTXiOxx5+sGLbqnRneT5SMDOrsP88PoZOnTrxre8eDCQ9oJ561nkM2GpD2rfvABGMe+oJjjjuJABeeXkSRx6wN29Nq+PgI4/l4CO+D8B999zF7TcOY97cz1h/40256+YbqKmpYdlll+Xkk09m9OjRXHrppWyzzTYVq7uPFMzMKmzySy/Sb4ONFpi27HJdWK12dY4+4RR222c/ho9+jAEDvw3A1Fde4ro/3M1t9z7E9ZdfyNy5c5ny8iRG3/snbv7TAwwf/Rg17Wq47bbbAPj4449Zf/31efLJJysaCOAjBTOzyivpAjs/vbEus7fdaTc6dupEx06dWLFbd95/dzpPPv4oE8c/y8F77wTAnDlzWG+NnkBy5PGd73ynkKo7FMzMKmzNvuvy91EjF5g2638f8fa0N6lpt3ADTceOnbLhdu3aMW/efCJgnwMG8aOhZ2XzGvo+6ty58wIP6KkkNx+ZmVXYFttsz5xPPuHeEXcCycngS3/xcwYe8D1W6r4ysz+e1fI6tt6Ov98/kvfenQHAhzNn8tprrxVab/CRgpktAcYfVbkv03J6KpXE5b+7lfN+egrDrriY+vp6ttlpV044/ed8Mns2N17zaw7cfdvsRHNj1uy7Lsed+lOOPfjb1NfX075DB24c9ht69arw5bUlHApmZgX42mq1XHXTnQtN79ipE7ff/3CTy+W7zB4w8NvZyWj4PJBmzWr5SKO13HxkZmYZh4KZmWUcCma22AmCiKh2Nariy75vh4KZLXZe+2Au82Z/tMQFQ0Tw3nvv0blz51avwyeazWyxc9WTMzke6NX1XUQjN5F9CRP/txQA78z8pKLr/SLbbk7nzp2pra1t9TYcCma22Pno03rOG/NeIeueesFeAOwx9P5C1l/Otovk5iMzM8sUGgqSBkiaJGmypKGNzF9d0j8kPSNpvKQ9i6yPmZk1r7BQkFQDXAPsAfQDDpLUr6TYz4DhEbExMAi4tqj6mJlZy4o8UtgcmBwRUyLiM+BOYN+SMgF0SYeXB6YVWB8zM2tBkaHQA3gjN16XTss7GzhEUh0wCmj0eXSSjpE0VtLYGTNmFFFXMzOj2FBo7Dqw0ouGDwJ+HxG1wJ7ArZIWqlNEDIuI/hHRv3v37gVU1czMoNhQqAN65sZrWbh56EhgOEBE/BvoDHQrsE5mZtaMIkPhKWBtSX0kdSQ5kTyypMzrwM4AktYjCQW3D5mZVUlhoRAR84AhwGhgIslVRhMknStpYFrsx8DRkp4F7gAOjyXtvnQzs0VIoXc0R8QokhPI+Wln5oZfALYusg5mZlY+39FsZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUKDQVJAyRNkjRZ0tAmyhwo6QVJEyTdXmR9zMysee2LWrGkGuAaYFegDnhK0siIeCFXZm3gDGDriJgpaeWi6mNmZi0r8khhc2ByREyJiM+AO4F9S8ocDVwTETMBImJ6gfUxM7MWFBkKPYA3cuN16bS8vkBfSY9LekLSgMZWJOkYSWMljZ0xY0ZB1TUzsyJDQY1Mi5Lx9sDawA7AQcDvJHVdaKGIYRHRPyL6d+/eveIVNTOzRJGhUAf0zI3XAtMaKfOXiJgbEa8Ck0hCwszMqqDIUHgKWFtSH0kdgUHAyJIyfwZ2BJDUjaQ5aUqBdTIzs2YUFgoRMQ8YAowGJgLDI2KCpHMlDUyLjQbek/QC8A/g1Ih4r6g6mZlZ8wq7JBUgIkYBo0qmnZkbDuDk9GVmZlXmO5r
NzCzjUDAzs4xDwczMMmWFgqQDJC2XDv9M0j2SNim2amZm1tbKPVL4eUT8T9I2wO7AzcB1xVXLzMyqodxQmJ/+uxdwXUT8BehYTJXMzKxayg2FNyVdDxwIjJLU6Qssa2ZmXxHlfrEfSHKj2YCI+ABYETi1sFqZmVlVlBUKETEbmA5sk06aB7xcVKXMzKw6yr366CzgdJIH4gB0AP5QVKXMzKw6ym0+2g8YCHwMEBHTgOWKqpSZmVVHuaHwWdpPUQBIWqa4KpmZWbWUGwrD06uPuko6Gvg78NviqmVmZtVQVi+pEXGJpF2Bj4B1gDMj4sFCa2ZmZm2uxVCQVAOMjohdAAeBmdlirMXmo4iYD8yWtHwb1MfMzKqo3IfszAGek/Qg6RVIABFxQiG1MjOzqig3FO5PX2Zmthgr90TzzZI6An3TSZMiYm5x1TIzs2ooKxQk7UDSXfZUQEBPSYMjYkxxVTMzs7ZWbvPRpcBuETEJQFJf4A5g06IqZmZmba/cm9c6NAQCQES8RNL/kZmZLUbKPVIYK+kG4NZ0/GDg6WKqZGZm1VJuKBwLHAecQHJOYQxwbVGVMjOz6ig3FNoDV0TEZZDd5dypsFqZmVlVlHtO4SFgqdz4UiSd4pmZ2WKk3FDoHBGzGkbS4aWLqZKZmVVLuaHwsaRNGkYk9Qc+KaZKZmZWLeWeUzgR+KOkaSQP2lkN+G5htTIzs6po9khB0maSvhYRTwHrAncB84AHgFfboH5mZtaGWmo+uh74LB3eCvgJcA0wExhWYL3MzKwKWmo+qomI99Ph7wLDIuJu4G5J44qtmpmZtbWWjhRqJDUEx87Aw7l55Z6PMDOzr4iWvtjvAB6V9C7J1UaPAUhaC/iw4LqZmVkbazYUIuI8SQ8BqwJ/i4hIZ7UDji+6cmZm1rbKeUbzExHxp4jIP4bzpYj4b0vLShogaZKkyZKGNlNuf0mR3v9gZmZVUu7Na19Y2j/SNcAeQD/gIEn9Gim3HElHe08WVRczMytPYaEAbA5MjogpEfEZcCewbyPlfgFcBMwpsC5mZlaGIkOhB/BGbrwunZaRtDHQMyLua25Fko6RNFbS2BkzZlS+pmZmBhQbCmpkWmQzpXbA5cCPW1pRRAyLiP4R0b979+4VrKKZmeUVGQp1QM/ceC0wLTe+HLA+8IikqcCWwEifbDYzq54iQ+EpYG1JfSR1BAYBIxtmRsSHEdEtInpHRG/gCWBgRIwtsE5mZtaMwkIhIuYBQ4DRwERgeERMkHSupIFFbdfMzFqv0K4qImIUMKpk2plNlN2hyLqYmVnLimw+MjOzrxiHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZQoNBUkDJE2SNFnS0EbmnyzpBUnjJT0kqVeR9TEzs+YVFgqSaoBrgD2AfsBBkvqVFHsG6B8RGwIjgIuKqo+ZmbWsyCOFzYHJETElIj4D7gT2zReIiH9ExOx09AmgtsD6mJlZC4oMhR7AG7nxunRaU44E/trYDEnHSBoraeyMGTMqWEUzM8srMhTUyLRotKB0CNAfuLix+RExLCL6R0T/7t27V7CKZmaW177AddcBPXPjtcC00kKSdgF+CmwfEZ8WWB8zM2tBkUcKTwFrS+ojqSMwCBiZLyBpY+B6YGBETC+wLmZmVobCQiEi5gFDgNHARGB4REyQdK6kgWmxi4FlgT9KGidpZBOrMzOzNlBk8xERMQoYVTLtzNzwLkVuv1TvObe35eYAmNrmWzQzaz3f0WxmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSi
YmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWKTQUJA2QNEnSZElDG5nfSdJd6fwnJfUusj5mZta8wkJBUg1wDbAH0A84SFK/kmJHAjMjYi3gcuDCoupjZmYtK/JIYXNgckRMiYjPgDuBfUvK7AvcnA6PAHaWpALrZGZmzVBEFLNiaX9gQEQclY4fCmwREUNyZZ5Py9Sl46+kZd4tWdcxwDHp6DrApEIq3bxuwLstlrJK8f5uO97Xbata+7tXRHRvqVD7AivQ2C/+0gQqpwwRMQwYVolKtZaksRHRv5p1WJJ4f7cd7+u2tajv7yKbj+qAnrnxWmBaU2UktQeWB94vsE5mZtaMIkPhKWBtSX0kdQQGASNLyowEBqfD+wMPR1HtWWZm1qLCmo8iYp6kIcBooAa4MSImSDoXGBsRI4EbgFslTSY5QhhUVH0qoKrNV0sg7++2433dthbp/V3YiWYzM/vq8R3NZmaWcSiYmVlmiQgFSTWSnpF0X8n0qyTNyo33kvSQpPGSHpFUm5u3uqS/SZoo6YWGLjkk7Szpv5LGSfqnpLXS6Zen08ZJeknSB23zbtuepBslTU/vO2mYdlfu/U+VNC6d3lHSTZKek/SspB1yyxyUTh8v6QFJ3ZpbV2651SXNknRKbtpJkiZIel7SHZI6F74j2oCkzpL+k+67CZLOSacPSbuLiYb9VrLcZpLmp/cPIWnH3D4dJ2mOpG+l85r6m260W5rmPtPFxaL2HSJpsKSX09dgKikiFvsXcDJwO3Bfblp/4FZgVm7aH4HB6fBOwK25eY8Au6bDywJLp8MvAeulwz8Eft/I9o8nOdFe9X1R0P7dDtgEeL6J+ZcCZ6bDxwE3pcMrA0+T/DhpD0wHuqXzLgLObm5duWl3p5/dKel4D+BVYKl0fDhweLX3U4X2tYBl0+EOwJPAlsDGQG9gasM+zC1TAzwMjAL2b2SdK5Jc6NHs33Q6/Jt0eBBwV3OfabX3VYX3+yLzHZJ+XlPSf1dIh1eo1Htd7I8U0qTeC/hdbloNcDFwWknxfsBD6fA/SLvlUNJnU/uIeBAgImZFxOy0XABd0uHlWfheDICDgDu+9JtZREXEGJq4v0SSgAP5/P1n+zgipgMfkPznUvpaJl2mCyX7spF1kf66nQJMKNl0e2ApJfe/LF26rq+qSDT8Mu2QviIinomIqU0sdjxJcE5vYv7+wF/L+Jtuqluapj7TxcIi+B2yO/BgRLwfETOBB4EBrX6DJRb7UAB+TfLB1eemDQFGRsRbJWWfBb6TDu8HLCdpJaAv8IGke9JDyIvTPwqAo4BRkuqAQ4EL8iuU1AvoQ/JLbUm0LfBORLycjj8L7CupvaQ+wKZAz4iYCxwLPEfyn6IfySXLTa5L0jLA6cA5+UIR8SZwCfA68BbwYUT8rYg3Vw1pU8Y4ki/5ByPiyWbK9iD5W/5NM6scxII/Wpr6m+4BvAHJJefAh8BKNPGZtua9LaIWte+Q7HNI1aXTKmKxDgVJewPTI+Lp3LTVgAOAqxpZ5BRge0nPANsDbwLzSH51bpvO3wxYAzg8XeYkYM+IqAVuAi4rWecgYEREzK/Q2/qqKT1KupHkj3gsyX+2fwHzJHUgCYWNgdWA8cAZLazrHODy3C9nACStQPILrU+6rmUkHVKpN1RtETE/IjYi6SVgc0nrN1P818DpTf39SVoV2IDkfqIGTf1NN9UtTaOfafnvaNG1iH6HlNU9UKtVu62uyBfwK5I/1qnA28BsYGY6PDV91ZP05lq67LJAXTq8JfBIbt6hJN2CdwdeyU1fHXihZD3PAN+s9r5og33dm5JzCiT/Ed4BaptZ7l8kRwWbAQ/
lpm8HjGpuXcBjuc/xA5ImrCEk/2FvyJU7DLi22vuooP1+Fum5lHR8KrlzCiTnVhr20SySo4tv5eb/CBiWG2/yb5okOLbKfR7vkt7r1NhnWu19U6H9u8h9h5D8OLo+N349cFCl3vNifaQQEWdERG1E9CZJ24cjYoWI+FpE9E6nz47keQ5I6iapYZ+cQfILCJIuO1aQ1NDD4E7ACyR/HMtL6ptO3xWY2LB9SeuQnAj6d2FvctG2C/BipL3gAkhaOm32QdKuwLyIeIHkF1W/3D5eYF82tq6I2Db3Of4aOD8iriZpNtoy3ZaAnUvW9ZUlqbukrunwUqT7panyEdEnt49GAD+MiD/nipQefTX3N91otzTNfKZfeYvod8hoYDdJK6RHxbux4JHel1JkL6lfRTsAv5IUwBiSqyqIiPlKLnd8KP2SeRr4bSRdeRwN3C2pnuQDPiK3voOAOyON88WVpDtI9l23tF30rIi4gYXbqiG5OmV0ur/eJPnFRERMU3J55RhJc4HX+PzwmibW1aiIeFLSCOC/JIfuz7CIdy3wBawK3Jy2R7cDhkfEfZJOIGn3/howXtKoSLutb0p6SWRP4NGGaS38TTfVLU2jn+kSagcK/g6JiPcl/YIkaADOjYiKdSTqbi7MzCyzWDcfmZnZF+NQMDOzjEPBzMwyDgUzM8s4FMzMLONQMDOzjEPBlhhpV8a7l0w7UdK1zSwzq6l5razD75V2X222KHIo2JLkDhZ+DnjZN8WZLQkcCrYkGQHsLakTZHf0rgaMU/JglP8qeVDMvqULStpBuQesSLpa0uHp8KaSHpX0tKTRaSdzLWpqufSI5kIlD9N5SdK2X/aNm5XLoWBLjIh4D/gPn/c9Pwi4C/gE2C8iNgF2BC5NuyJoUdq761UkD6/ZlKSvm/MqsFz7iNgcOJGk0zuzNuG+j2xJ09CE9Jf03yNIuiI+X9J2JD1e9gBWIekJsyXrAOsDD6Y5UkPyDIcvu9w96b9Pk/RAa9YmHAq2pPkzcJmkTUge1/nftBmoO7BpRMyVNBUofabzPBY8sm6YL2BCRGz1BevR0nKfpv/Ox/9PrQ25+ciWKJE8kOcRkuaahhPMy5M8SGWupB2BXo0s+hqITUiwAAAAnElEQVRJ196dJC1P0h03wCSgu6StIGkWkvT1MqrS2uXMCuVfILYkuoOkeabhSqTbgHsljQXG0cjzCSLiDUnDSZ4I9zJJd9xExGfpJaZXpmHRnuTZDqXPjC5dX6uWMyuau842M7OMm4/MzCzj5iOzAki6Bti6ZPIVEXFTNepjVi43H5mZWcbNR2ZmlnEomJlZxqFgZmYZh4KZmWX+Hw+3LHZZYCN9AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotPercentualComparison(df, title='ALL VALUES: json x other')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This graph proves that all bigger values are JSON and the non-json types only appear on the smaller values." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "There is absolutely no value with value_len greater than 104653 (the maximum value_len for non-json values) that is not a valid JSON in this 10% sample. \n", + "\n", + "This implies that all the greater values are JSON, but they represent a very low percentage of the data: 0.30% of the whole sample (6.76% of the values above the mean). " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The top (0.30% - whole sample) / (6.76% - values above the mean) is gurantee to be a valid JSON\n" + ] + } + ], + "source": [ + "max_non_json_value_len = df[df.is_json == False].value_len.max()\n", + "allJson = df[df['value_len'] > max_non_json_value_len ]\n", + "length = allJson.is_json.count()\n", + "print(\"The top ({0:0.2f}% - whole sample) / ({1:0.2f}% - values above the mean) is gurantee to be a valid JSON\".format(\n", + "    length / df.is_json.count() * 100, length / df[df.value_len > df.value_len.mean()].is_json.count() * 100))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--- \n", + "\n", + "# Out of Curiosity: small values\n", + "This is not exacly relevant to the issue 22 ('What's in the really large values?') but I was courisous to know how was the distribution of the smaller values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distribution of non-json values\n", + "That concentration of the non-json values made me curious: how is the distribution of NON-JSON values among the smaller values? 
\n", + "To answer this I filtered the data to only values below the biggest non-json value and plotted the same graph.\n", + "- What I got is, again, the absolute majority of these non-json are in the first bin, so they really tend to be small values. \n", + " \n", + " *TODO: what is that really small portion of non-json values present in the 9th bin? Are they any different from the other, smaller ones?" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bigger non json value_len: 104653\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Bigger non json value_len: \", max_non_json_value_len)\n", + "plotPercentualComparison(df[df.value_len < (max_non_json_value_len)], title=\"Up to the bigger NON-JSON: json X other\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Still, what about that first bin (the first 1/10th)?" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first 1/10th: 10465.3\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "print(\"first 1/10th: \", max_non_json_value_len/10)\n", + "plotPercentualComparison(df[df.value_len < (max_non_json_value_len/10)], title=\"First 1/10th: json X other\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distribution for the values below the mean\n", + "This is where 95% of the rows are. They do include JSON values, but as we can see, most of this data is of some other type, not json, and it has an even distribution across this value range. Why is that?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The values smaller than the mean represent 95.57% of the whole sample\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "m = df.value_len.mean()\n", + "bellow_mean = df[df.value_len <= (m)]\n", + "print('The values smaller than the mean represent {0:0.2f}% of the whole sample'.format(bellow_mean.is_json.count()/df.is_json.count()*100))\n", + "plotPercentualComparison(bellow_mean, title='Bellow the mean: json X other')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is more JSON values bellow the mean than above the mean, but thats not surprising since the data bellow the mean is 95% of everthing." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jsons bellow the mean are 90.48% of all jsons\n" + ] + } + ], + "source": [ + "bellow_mean_count = bellow_mean[bellow_mean.is_json == True].is_json.count()\n", + "above_mean_count = df[(df.value_len > m) & (df.is_json == True)].is_json.count()\n", + "total = bellow_mean_count + above_mean_count\n", + "print(\"Jsons bellow the mean are {0:.2f}% of all jsons\".format(bellow_mean_count/total * 100))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 699b06677c45dc7d0b7c115649c846c89caa77f3 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 8 Apr 2019 15:31:20 -0300 Subject: [PATCH 16/23] Fix typo --- .../isJson_Value_Distribution.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb 
b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb index ba2df56..77766f7 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb @@ -142,7 +142,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -379,7 +379,7 @@ "metadata": {}, "source": [ "## All values\n", - "If all data is divided in 10 bins and the percentual of NON-JSON values in each bin is painted orange, we have the following graph:" + "If all data is divided in 10 bins and the percentage of NON-JSON values in each bin is painted orange, we have the following graph:" ] }, { @@ -441,7 +441,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "The top (0.30% - whole sample) / (6.76% - values above the mean) is gurantee to be a valid JSON\n" + "The top (0.30% - whole sample) / (6.76% - values above the mean) is guarantee to be a valid JSON\n" ] } ], @@ -449,7 +449,7 @@ "max_non_json_value_len = df[df.is_json == False].value_len.max()\n", "allJson = df[df['value_len'] > max_non_json_value_len ]\n", "length = allJson.is_json.count()\n", - "print(\"The top ({0:0.2f}% - whole sample) / ({1:0.2f}% - values above the mean) is gurantee to be a valid JSON\".format(\n", + "print(\"The top ({0:0.2f}% - whole sample) / ({1:0.2f}% - values above the mean) is guarantee to be a valid JSON\".format(\n", " length / df.is_json.count() * 100, length / df[df.value_len > df.value_len.mean()].is_json.count() * 100))" ] }, @@ -460,7 +460,7 @@ "--- \n", "\n", "# Out of Curiosity: small values\n", - "This is not exacly relevant to the issue 22 ('What's in the really large values?') but I was courisous to know how was the distribution of the smaller values" + "This is not exactly relevant to the issue 22 ('What's in the really large values?') but I was curious to know how was the distribution of the smaller values" ] }, { @@ -616,7 +616,7 @@ "cell_type": "markdown", 
"metadata": {}, "source": [ - "There is more JSON values bellow the mean than above the mean, but thats not surprising since the data bellow the mean is 95% of everthing." + "There is more JSON values below the mean than above the mean, but that's not surprising since the data below the mean is 95% of everything." ] }, { @@ -628,7 +628,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Jsons bellow the mean are 90.48% of all jsons\n" + "Jsons below the mean are 90.48% of all jsons\n" ] } ], @@ -636,7 +636,7 @@ "bellow_mean_count = bellow_mean[bellow_mean.is_json == True].is_json.count()\n", "above_mean_count = df[(df.value_len > m) & (df.is_json == True)].is_json.count()\n", "total = bellow_mean_count + above_mean_count\n", - "print(\"Jsons bellow the mean are {0:.2f}% of all jsons\".format(bellow_mean_count/total * 100))" + "print(\"Jsons below the mean are {0:.2f}% of all jsons\".format(bellow_mean_count/total * 100))" ] } ], From 46c31d04a56d6783137853d0c8deaf8e500a3cda Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 8 Apr 2019 17:46:01 -0300 Subject: [PATCH 17/23] Removed fixed names, session organization, removed false positives for valid jsons --- .../isJson_dataPrep.ipynb | 613 ++++++++++-------- 1 file changed, 325 insertions(+), 288 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb index d14915f..f0dbbc4 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb @@ -17,16 +17,18 @@ "output_type": "stream", "text": [ "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", - " data = yaml.load(f.read()) or {}\n", - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", - " defaults = yaml.load(f)\n" + " data = yaml.load(f.read()) or {}\n" ] } ], "source": [ "import dask.dataframe as dd\n", - "from dask.distributed import Client\n", - "from dask.diagnostics import ProgressBar" + "from dask.diagnostics import ProgressBar\n", + "import json\n", + "import pandas as pd\n", + "import os\n", + "import tldextract\n", + "import hashlib\n" ] }, { @@ -34,7 +36,9 @@ "metadata": {}, "source": [ "All sub samples and new samples with new columns/data will be saved under the \"DIR\" directory to keep things organized. \n", - "As such, the function \"save_parquet\" and \"read_parquet\" adds this directory to every parquet name, and I'm using this functions instead of dd.read_parquet/dd.to_parquet direct to ensure the same read and write settings across the notebook. " + "As such, the function \"save_parquet\" and \"read_parquet\" adds this directory to every parquet name, and I'm using this functions instead of dd.read_parquet/dd.to_parquet direct to ensure the same read and write settings across the notebook. \n", + "\n", + "NOTE: each section adds its name to the 'FILE_NAME' and saves the new parquet with this name. Because of it, you can run the sections at any order you desire to have the output you need. 
" ] }, { @@ -48,8 +52,9 @@ "# client\n", "\n", "#Create folder to save/read new data\n", - "DIR = 'sample_0_prep/'\n", - "import os\n", + "DIR = 'sample0_prep/'\n", + "FILE_NAME = 's0'\n", + "\n", "if not os.path.exists(DIR):\n", " os.makedirs(DIR)" ] @@ -74,13 +79,13 @@ " if recalculate_partition:\n", " n = 1+df.memory_usage(deep=True).sum().compute() // (1000 * 1000 * 100)\n", " print(\"Npartition: \", n)\n", - " df.repartition(npartitions= n).to_parquet(DIR + name, engine=\"pyarrow\")\n", + " df.repartition(npartitions= n).to_parquet(DIR + name + '.parquet', engine=\"pyarrow\")\n", " else:\n", - " df.to_parquet(DIR + name, engine=\"pyarrow\")\n", + " df.to_parquet(DIR + name + '.parquet', engine=\"pyarrow\")\n", " \n", " \n", "def read_parquet(name):\n", - " return dd.read_parquet(DIR + name, engine='pyarrow')" + " return dd.read_parquet(DIR + name + '.parquet', engine='pyarrow')" ] }, { @@ -101,7 +106,9 @@ { "data": { "text/plain": [ - "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'], dtype='object')" + "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location',\n", + " 'operation'],\n", + " dtype='object')" ] }, "execution_count": 4, @@ -110,10 +117,10 @@ } ], "source": [ - "#Original sample\n", + "#Original sample \n", "df = dd.read_parquet('sample_0.parquet', \n", " engine='pyarrow', \n", - " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'])\n", + " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location', 'operation'])\n", "\n", "# df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", "df.columns" @@ -140,7 +147,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 48.2s\n", + "[########################################] | 100% Completed | 58.7s\n", "1356.9776628910975 0 4496861 26310.62140481331 11292867\n" ] } @@ -160,12 +167,11 @@ 
"cell_type": "markdown", "metadata": {}, "source": [ - "### FILTER: value_len > df_mean\n", - "1356 is the value_len mean\n", - "\n", - "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", + "# Add Column: Domains\n", + "The following code is copyed from this same project: ~/analyses/hello_world.ipynb\n", "\n", - "All values above the mean count up to 499805 rows. That is just 4,42% of the whole sample, and a lot easier to work on. " + "It uses the data saved from the last section\n", + "This section is dedicated to extract the domain of the columns \"location\" and \"script_url\" and add it as new columns \"location_domain\" and \"script_domain\"" ] }, { @@ -177,58 +183,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 58.0s\n", - "Npartition: 244\n", - "[########################################] | 100% Completed | 1min 30.9s\n" + "Notebook name: s0_domains\n" ] } ], "source": [ - "#Save\n", - "save_parquet(df= df[df['value_len'] > df_mean], name='above_mean.parquet', recalculate_partition=True)" + "FILE_NAME += '_domains'\n", + "print('Notebook name: ', FILE_NAME)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location'], dtype='object')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Read\n", - "df = read_parquet('above_mean.parquet')\n", - "df.columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add Column: Domains\n", - "The following code is copyed from this same project: ~/analyses/hello_world.ipynb\n", - "\n", - "It uses the data saved from the last section\n", - "This section is dedicated to extract the domain of the columns \"location\" and \"script_url\" and add 
it as new columns \"location_domain\" and \"script_domain\"" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, "outputs": [], "source": [ - "import tldextract\n", - "\n", "def extract_domain(url):\n", " \"\"\"Use tldextract to return the base domain from a url\"\"\"\n", " try:\n", @@ -240,13 +209,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "#To guarantee the usage of the correct parquet created above in case we start from this section\n", - "df = read_parquet('above_mean.parquet')\n", - "\n", "df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str, 'location': str})\n", "df['location_domain'] = df.location.apply(extract_domain, meta='O')\n", "df['script_domain'] = df.script_url.apply(extract_domain, meta='O')" @@ -254,25 +220,25 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 1min 17.3s\n" + "[########################################] | 100% Completed | 6min 23.0s\n" ] } ], "source": [ "#save\n", - "save_parquet(df=df, name='above_mean_domain.parquet')" + "save_parquet(df=df, name=FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -305,67 +271,67 @@ " \n", " \n", " 0\n", - " canada.ca\n", - " https://www.canada.ca/en/services.html\n", - " adobedtm.com\n", - " https://assets.adobedtm.com/caacec67651710193d...\n", + " vk.com\n", + " https://vk.com/widget_comments.php?app=2297596...\n", + " vk.com\n", + " https://vk.com/js/api/xdm.js?1449919642\n", " \n", " \n", " 1\n", - " tmall.com\n", - " https://maniform.world.tmall.com/category-1282...\n", - " alicdn.com\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", + " vk.com\n", + " 
https://vk.com/widget_comments.php?app=2297596...\n", + " vk.com\n", + " https://vk.com/js/api/xdm.js?1449919642\n", " \n", " \n", " 2\n", - " tmall.com\n", - " https://maniform.world.tmall.com/category-1282...\n", - " alicdn.com\n", - " https://g.alicdn.com/alilog/mlog/aplus_v2.js\n", + " vk.com\n", + " https://vk.com/widget_comments.php?app=2297596...\n", + " vk.com\n", + " https://vk.com/js/al/aes_light.js?592436914\n", " \n", " \n", " 3\n", - " coches.net\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " https://www.coches.net/scripts/common.min.js?2...\n", + " baidu.com\n", + " https://pos.baidu.com/s?hei=70&wid=670&di=u313...\n", + " baidustatic.com\n", + " https://cpro.baidustatic.com/cpro/ui/noexpire/...\n", " \n", " \n", " 4\n", - " coches.net\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " https://www.coches.net/scripts/common.min.js?2...\n", + " serienjunkies.org\n", + " http://serienjunkies.org/smilf/smilf-season-1-...\n", + " google.com\n", + " https://apis.google.com/js/plusone.js?_=151338...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " location_domain location \\\n", - "0 canada.ca https://www.canada.ca/en/services.html \n", - "1 tmall.com https://maniform.world.tmall.com/category-1282... \n", - "2 tmall.com https://maniform.world.tmall.com/category-1282... \n", - "3 coches.net https://www.coches.net/fiat/segunda-mano/ \n", - "4 coches.net https://www.coches.net/fiat/segunda-mano/ \n", + " location_domain location \\\n", + "0 vk.com https://vk.com/widget_comments.php?app=2297596... \n", + "1 vk.com https://vk.com/widget_comments.php?app=2297596... \n", + "2 vk.com https://vk.com/widget_comments.php?app=2297596... \n", + "3 baidu.com https://pos.baidu.com/s?hei=70&wid=670&di=u313... \n", + "4 serienjunkies.org http://serienjunkies.org/smilf/smilf-season-1-... \n", "\n", - " script_domain script_url \n", - "0 adobedtm.com https://assets.adobedtm.com/caacec67651710193d... 
\n", - "1 alicdn.com https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "2 alicdn.com https://g.alicdn.com/alilog/mlog/aplus_v2.js \n", - "3 coches.net https://www.coches.net/scripts/common.min.js?2... \n", - "4 coches.net https://www.coches.net/scripts/common.min.js?2... " + " script_domain script_url \n", + "0 vk.com https://vk.com/js/api/xdm.js?1449919642 \n", + "1 vk.com https://vk.com/js/api/xdm.js?1449919642 \n", + "2 vk.com https://vk.com/js/al/aes_light.js?592436914 \n", + "3 baidustatic.com https://cpro.baidustatic.com/cpro/ui/noexpire/... \n", + "4 google.com https://apis.google.com/js/plusone.js?_=151338... " ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read\n", - "df = read_parquet('above_mean_domain.parquet')\n", + "df = read_parquet(FILE_NAME)\n", "df[['location_domain', 'location', 'script_domain', 'script_url']].head()" ] }, @@ -380,20 +346,37 @@ "After simple validation of value is a json or not, boolean value will be saved on a new column named \"is_json\"\n" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notebook name: s0_domains_isjson\n" + ] + } + ], + "source": [ + "FILE_NAME += '_isjson'\n", + "print('Notebook name: ', FILE_NAME)" + ] + }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "import json\n", - "import pandas as pd\n", - "\n", "def is_json(myjson):\n", + " if (myjson == '{}'):\n", + " #would be counted as valid, but its an empty json\n", + " return False\n", " try:\n", - " json.loads(myjson)\n", - " return True\n", - "\n", + " #Eliminate false positives\n", + " return (type(json.loads(myjson)) == dict)\n", " except ValueError as e:\n", " return False" ] @@ -404,8 +387,6 @@ "metadata": {}, "outputs": [], "source": [ - "#To guarantee the usage of the correct parquet created above in case we start from this 
section\n", - "df = read_parquet('above_mean_domain.parquet')\n", "df['is_json'] = df['value'].apply(is_json, meta=False)" ] }, @@ -418,13 +399,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 2min 25.1s\n" + "[########################################] | 100% Completed | 4min 21.6s\n" ] } ], "source": [ "#save\n", - "save_parquet(df=df, name='above_mean_domain_json.parquet')" + "save_parquet(df=df, name=FILE_NAME)" ] }, { @@ -460,27 +441,27 @@ " \n", " \n", " 0\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " True\n", + " fXDcab74\n", + " False\n", " \n", " \n", " 1\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " True\n", + " fXDcab74\n", + " False\n", " \n", " \n", " 2\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " True\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " False\n", " \n", " \n", " 3\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", " False\n", " \n", " \n", " 4\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", + " _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...\n", " False\n", " \n", " \n", @@ -489,11 +470,11 @@ ], "text/plain": [ " value_1000 is_json\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... True\n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... False\n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... False" + "0 fXDcab74 False\n", + "1 fXDcab74 False\n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... False\n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... False\n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... 
False" ] }, "execution_count": 15, @@ -503,7 +484,7 @@ ], "source": [ "#read\n", - "df = read_parquet('above_mean_domain_json.parquet')\n", + "df = read_parquet(FILE_NAME)\n", "df[['value_1000', 'is_json']].head()" ] }, @@ -519,47 +500,60 @@ "cell_type": "code", "execution_count": 16, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notebook name: s0_domains_isjson_md5\n" + ] + } + ], + "source": [ + "FILE_NAME += '_md5'\n", + "print('Notebook name: ', FILE_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ - "import hashlib\n", - "\n", "def md5(value):\n", " return hashlib.md5(value.encode('utf-8')).hexdigest()" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "#To guarantee the usage of the correct parquet created above in case we start from this section\n", - "df = read_parquet('above_mean_domain_json.parquet') \n", - "\n", - "df['value_md5'] = df['value'].apply(md5, meta=' ')" + "df['value_md5'] = df['value'].apply(md5, meta='O')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 1min 26.8s\n" + "[########################################] | 100% Completed | 2min 45.9s\n" ] } ], "source": [ "#save\n", - "save_parquet(df=df, name='above_mean_domain_json_md5.parquet')" + "save_parquet(df=df, name=FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -590,28 +584,28 @@ " \n", " \n", " 0\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " cff77029e3ae45dd439a62987b1d8340\n", + " fXDcab74\n", + " 7df64196939a8b6ff11482ed6df4b25a\n", " \n", " \n", " 1\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 
9ac0a0a0afb677c8fd985a7c2f4ddbc5\n", + " fXDcab74\n", + " 7df64196939a8b6ff11482ed6df4b25a\n", " \n", " \n", " 2\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " 9ac0a0a0afb677c8fd985a7c2f4ddbc5\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " bc0aac3569031babbd73e069947a4b12\n", " \n", " \n", " 3\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " db64465b639e01993d9212390f057628\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " bc0aac3569031babbd73e069947a4b12\n", " \n", " \n", " 4\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " db64465b639e01993d9212390f057628\n", + " _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...\n", + " 324dd29b8c6438bc700ac2d85e33f12d\n", " \n", " \n", "\n", @@ -619,28 +613,28 @@ ], "text/plain": [ " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", + "0 fXDcab74 \n", + "1 fXDcab74 \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... 
\n", "\n", " value_md5 \n", - "0 cff77029e3ae45dd439a62987b1d8340 \n", - "1 9ac0a0a0afb677c8fd985a7c2f4ddbc5 \n", - "2 9ac0a0a0afb677c8fd985a7c2f4ddbc5 \n", - "3 db64465b639e01993d9212390f057628 \n", - "4 db64465b639e01993d9212390f057628 " + "0 7df64196939a8b6ff11482ed6df4b25a \n", + "1 7df64196939a8b6ff11482ed6df4b25a \n", + "2 bc0aac3569031babbd73e069947a4b12 \n", + "3 bc0aac3569031babbd73e069947a4b12 \n", + "4 324dd29b8c6438bc700ac2d85e33f12d " ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read\n", - "df = read_parquet('above_mean_domain_json_md5.parquet')\n", + "df = read_parquet(FILE_NAME)\n", "df[['value_1000', 'value_md5']].head()" ] }, @@ -648,14 +642,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Saving other possible usefull samples to future analyses" + "# Saving other possible usefull filtered samples to future analyses" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Filter to parquet containing only JSON " + "## value_len > df_mean\n", + "1356 is the value_len mean\n", + "\n", + "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", + "\n", + "All values above the mean count up to 499805 rows. That is just 4,42% of the whole sample, and a lot easier to work on. 
" ] }, { @@ -667,21 +666,110 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 27.4s\n", - "Npartition: 233\n", - "[########################################] | 100% Completed | 1min 3.4s\n" + "Notebook name: s0_domains_isjson_md5_above_mean\n" ] } ], "source": [ - "df = read_parquet('above_mean_domain_json_md5.parquet')\n", - "save_parquet(df=df[df['is_json'] == True], name='JSONs_only.parquet', recalculate_partition=True)" + "name = FILE_NAME + '_above_mean'\n", + "print('Notebook name: ', name)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 50.5s\n", + "Npartition: 245\n", + "[########################################] | 100% Completed | 1min 38.3s\n" + ] + } + ], + "source": [ + "#Save\n", + "save_parquet(df= df[df['value_len'] > df_mean], name= name, recalculate_partition=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location',\n", + " 'operation', 'location_domain', 'script_domain', 'is_json',\n", + " 'value_md5'],\n", + " dtype='object')" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Read\n", + "df = read_parquet(name)\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter to parquet containing only JSON " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notebook name: s0_domains_isjson_md5_JSON_ONLY\n" + ] + } + ], + "source": [ + "name = FILE_NAME + '_JSON_ONLY'\n", + "print('Notebook name: ', name)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 28.9s\n", + "Npartition: 233\n", + "[########################################] | 100% Completed | 1min 5.0s\n" + ] + } + ], + "source": [ + "save_parquet(df=df[df['is_json'] == True], name=name, recalculate_partition=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, "outputs": [ { "data": { @@ -747,14 +835,14 @@ "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... True" ] }, - "execution_count": 22, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read all_json_above_mean\n", - "df = read_parquet('JSONs_only.parquet')\n", + "df = read_parquet(name)\n", "df[['value_1000', 'is_json']].head()" ] }, @@ -775,14 +863,30 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notebook name: s0_domains_isjson_md5_JSON_ONLY_schema_keys\n" + ] + } + ], + "source": [ + "name += '_schema_keys'\n", + "print('Notebook name: ', name)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from json_schema_inferencer.guess_json_schema import guess_schema\n", "\n", - "df = read_parquet('JSONs_only.parquet')\n", - "\n", "def jsonSchema(myjson):\n", " try:\n", " dct = json.loads(myjson)\n", @@ -805,26 +909,26 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 3min 57.7s\n" + "[########################################] | 100% Completed | 4min 18.1s\n" ] } ], "source": [ - "df['json_keys'] = df['value'].apply(jsonKeys, meta='')\n", - "df['json_schema'] = 
df['value'].apply(jsonSchema, meta='')\n", - "save_parquet(df=df, name='JSONs_key_schema.parquet')" + "df['json_keys'] = df.value.apply(jsonKeys, meta='O')\n", + "df['json_schema'] = df.value.apply(jsonSchema, meta='O')\n", + "save_parquet(df=df, name=name)\n" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -911,14 +1015,14 @@ "4 [LastSearch, LastSearch_e, dueljs_channel_comm... " ] }, - "execution_count": 25, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read \n", - "df = read_parquet('JSONs_key_schema.parquet')\n", + "df = read_parquet(name)\n", "df[['value_1000', 'json_keys', 'json_schema']].head()" ] }, @@ -926,34 +1030,60 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### All NON json above the mean" + "## All NON json above the mean" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 26.7s\n", - "Npartition: 12\n", - "[########################################] | 100% Completed | 27.8s\n" + "Notebook name: s0_domains_isjson_md5_nonJSON_ONLY\n" ] } ], "source": [ - "df = read_parquet('above_mean_domain_json_md5.parquet')\n", - "save_parquet(df=df[df['is_json'] == False], name='NON_JSONs_only.parquet', recalculate_partition=True)" + "name = FILE_NAME + '_nonJSON_ONLY'\n", + "df = read_parquet(FILE_NAME)\n", + "print('Notebook name: ', name)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 1min 54.5s\n", + "Npartition: 116\n", + "[########################################] | 100% Completed | 1min 13.1s\n" + ] + } + ], + "source": [ + "save_parquet(df=df[df['is_json'] == False], 
name=name, recalculate_partition=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/dataframe/core.py:4494: UserWarning: Insufficient elements for `head`. 5 elements requested, only 0 elements available. Try passing larger `npartitions` to `head`.\n", + " warnings.warn(msg.format(n, len(r)))\n" + ] + }, { "data": { "text/html": [ @@ -981,127 +1111,34 @@ " symbol\n", " script_url\n", " location\n", + " operation\n", " location_domain\n", " script_domain\n", " is_json\n", " value_md5\n", + " json_keys\n", + " json_schema\n", " \n", " \n", " \n", - " \n", - " 0\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " coches.net\n", - " False\n", - " db64465b639e01993d9212390f057628\n", - " \n", - " \n", - " 1\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://www.coches.net/scripts/common.min.js?2...\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " coches.net\n", - " False\n", - " db64465b639e01993d9212390f057628\n", - " \n", - " \n", - " 2\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://tags.tiqcdn.com/utag/schibsted/coches....\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " tiqcdn.com\n", - " False\n", - " db64465b639e01993d9212390f057628\n", - " \n", - " \n", - " 3\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 
usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://tags.tiqcdn.com/utag/schibsted/coches....\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " tiqcdn.com\n", - " False\n", - " db64465b639e01993d9212390f057628\n", - " \n", - " \n", - " 4\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " usunico=17/12/2017:0-00155123:830; SessionASM=...\n", - " 1358\n", - " window.document.cookie\n", - " https://tags.tiqcdn.com/utag/schibsted/coches....\n", - " https://www.coches.net/fiat/segunda-mano/\n", - " coches.net\n", - " tiqcdn.com\n", - " False\n", - " db64465b639e01993d9212390f057628\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " value_1000 \\\n", - "0 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "1 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "2 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... \n", - "\n", - " value value_len \\\n", - "0 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "1 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "2 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "3 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "4 usunico=17/12/2017:0-00155123:830; SessionASM=... 1358 \n", - "\n", - " symbol script_url \\\n", - "0 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "1 window.document.cookie https://www.coches.net/scripts/common.min.js?2... \n", - "2 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... \n", - "3 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... \n", - "4 window.document.cookie https://tags.tiqcdn.com/utag/schibsted/coches.... 
\n", - "\n", - " location location_domain script_domain \\\n", - "0 https://www.coches.net/fiat/segunda-mano/ coches.net coches.net \n", - "1 https://www.coches.net/fiat/segunda-mano/ coches.net coches.net \n", - "2 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", - "3 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", - "4 https://www.coches.net/fiat/segunda-mano/ coches.net tiqcdn.com \n", - "\n", - " is_json value_md5 \n", - "0 False db64465b639e01993d9212390f057628 \n", - "1 False db64465b639e01993d9212390f057628 \n", - "2 False db64465b639e01993d9212390f057628 \n", - "3 False db64465b639e01993d9212390f057628 \n", - "4 False db64465b639e01993d9212390f057628 " + "Empty DataFrame\n", + "Columns: [value_1000, value, value_len, symbol, script_url, location, operation, location_domain, script_domain, is_json, value_md5, json_keys, json_schema]\n", + "Index: []" ] }, - "execution_count": 28, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#read \n", - "df = read_parquet('NON_JSONs_only.parquet')\n", + "df = read_parquet(name)\n", "df.head()" ] }, From df6d8433d087bf9836ef1430ce1ed9ad71f454b1 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 8 Apr 2019 17:51:50 -0300 Subject: [PATCH 18/23] Value distribution with new data that filtered json false positives --- .../isJson_Value_Distribution.ipynb | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb index 77766f7..3ff820d 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb @@ -40,7 +40,7 @@ "metadata": {}, "source": [ "# Parquet\n", - "Used sample: sample_0_prep/full_sample_json.parquet\n", + "Used sample: 
sample_0_prep/s0_domains_isjson.parquet\n", " * This sample is the 10% sample with the \"is_json\" column added to it, this column is the result of the 'value' columns as a valid json or not. \n", " * This sample can be obtained by running 'jsJson_dataPrep.ipynb'" ] @@ -96,7 +96,7 @@ } ], "source": [ - "df = dd.read_parquet('sample_0_prep/full_sample_json.parquet', engine=\"pyarrow\", columns=['value_len', 'is_json'])\n", + "df = dd.read_parquet('sample0_prep/s0_domains_isjson.parquet', engine=\"pyarrow\", columns=['value_len', 'is_json'])\n", "df.head(1)" ] }, @@ -109,7 +109,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 1.6s\n" + "[########################################] | 100% Completed | 1.5s\n" ] } ], @@ -142,7 +142,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -194,7 +194,7 @@ }, { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAELCAYAAADz6wBxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFqxJREFUeJzt3X+QXWd93/H3JzLCDSEOGEFdy4rkyHHipknBWwWawpC0JnZi4YF4GimmUFu1BoLbZtJMRm46BaaTUSBTEsCe2E4xxkywcR2nkYyohjGkzkw9INkmII0irDim3tiJDEmME2aCRb794x7ZV8v+uLvnnr2rs+/XzM7e8+w95/neY60/+zznueemqpAkaam+Y9IFSJJObwaJJKkVg0SS1IpBIklqxSCRJLVikEiSWjFIJEmtGCSSpFZOyyBJ8vokf5jkpiSvn3Q9krSanbHcHSa5FbgcOF5VPzTUfinwAWAN8D+q6tfmOUwBfwOcCUwv1OfLXvay2rhxY5uyJWnVefDBB79aVesWel6W+xYpSV7HIARuPxkkSdYAXwYuYRAMB4DtDEJl94xDXAN8tar+PskrgPdX1VXz9Tk1NVUHDx4c7wuRpJ5L8mBVTS30vGUfkVTV/Uk2zmjeAhyrqkcBktwJXFFVuxmMXubyV8ALZ/tBkp3AToANGza0rFqSNJeVco3kXODxoe3ppm1WSd6c5GbgY8ANsz2nqm6pqqmqmlq3bsGRmSRpiZZ9RDKHzNI255xbVd0D3LPgQZOtwNbNmze3KE2SNJ+VMiKZBs4b2l4PPDGhWiRJi7BSguQAcEGSTUnWAtuAPW0PWlV7q2rnWWed1bpASdLslj1IktwBPABcmGQ6yY6qOgFcB+wHjgB3VdXhMfS1NcktTz/9dNtDSZLmsOzLfyfB5b+StHijLv9dKVNbnXBEIkndWymrtjpRVXuBvVNTU9e2Oc7GXZ88ZfuxM3/u25/0bsNK0urU6xGJJKl7vQ4Sp7YkqXu9DhKX/0pS93odJJKk7vU6SJzakqTu9TpInNqSpO71OkgkSd
0zSCRJrfQ6SLxGIknd63WQeI1EkrrX6yCRJHXPIJEktWKQSJJa6XWQeLFdkrrX6yDxYrskda/XQSJJ6p5BIklqxSCRJLVikEiSWjktP7M9yXcA/w34buBgVX10wiVJ0qq17COSJLcmOZ7k0Iz2S5McTXIsya4FDnMFcC7wLDDdVa2SpIVNYkRyG3ADcPvJhiRrgBuBSxgEw4Eke4A1wO4Z+18DXAg8UFU3J7kbuG8Z6pYkzWLZg6Sq7k+ycUbzFuBYVT0KkORO4Iqq2g1cPvMYSaaBbzab3+quWknSQlbKxfZzgceHtqebtrncA/xkkg8B98/2hCQ7kxxMcvCpp54aX6WSpFOslIvtmaWt5npyVX0D2DHfAavqliRPAlvXrl17ccv6JElzWCkjkmngvKHt9cATbQ/qLVIkqXsrJUgOABck2ZRkLbAN2NP2oN60UZK6N4nlv3cADwAXJplOsqOqTgDXAfuBI8BdVXW4bV+OSCSpe5NYtbV9jvZ9wL5x9pVkK7B18+bN4zysJGnISpna6oQjEknqXq+DxGskktS9XgeJIxJJ6l6vg8QRiSR1r9dB4ohEkrrX6yCRJHWv10Hi1JYkda/XQeLUliR1r9dBIknqnkEiSWql10HiNRJJ6l6vg8RrJJLUvV4HiSSpewaJJKkVg0SS1IpBIklqpddB4qotSeper4PEVVuS1L1eB4kkqXsGiSSpFYNEktSKQSJJauWMSRewFEleC1zFoP6LquqfT7gkSVq1ln1EkuTWJMeTHJrRfmmSo0mOJdk13zGq6g+r6u3AvcBHu6xXkjS/SYxIbgNuAG4/2ZBkDXAjcAkwDRxIsgdYA+yesf81VXW8efxzwL/rumBJ0tyWPUiq6v4kG2c0bwGOVdWjAEnuBK6oqt3A5bMdJ8kG4Omq+vocP98J7ATYsGHDeIqXJH2blXKx/Vzg8aHt6aZtPjuAj8z1w6q6BXgP8NDatWtbFyhJmt1KCZLM0lbz7VBV76qq/7vAc3xnuyR1bKUEyTRw3tD2euCJtgf1XluS1L2VEiQHgAuSbEqyFtgG7Gl7UEckktS9SSz/vQN4ALgwyXSSHVV1ArgO2A8cAe6qqsNj6MsRiSR1bBKrtrbP0b4P2DfmvvYCe6empq4d53ElSc9bKVNbnXBEIknd63WQeI1EkrrX6yBxRCJJ3et1kDgikaTu9TpIJEndM0gkSa30Oki8RiJJ3TstP9hqVJN6H8nGXZ/8trbHfu2nl7MESVo2vR6RSJK61+sgcWpLkrrX6yBx+a8kda/XQSJJ6p5BIklqxSCRJLVikEiSWul1kLhqS5K61+sgcdWWJHWv1+9sP134TnhJp7Nej0gkSd0zSCRJrRgkkqRWRgqSJJcnWTGhk2RDkj1Jbk2ya9L1SNJqNmo4bAMeSfK+JD/YpsPmf/7Hkxya0X5pkqNJjo0QDt8PfLKqrgEualOPJKmdkYKkqt4CvBL4E+AjSR5IsjPJi5fQ523ApcMNSdYANwKXMQiG7UkuSvJPktw74+vlwMPAtiSfAT67hBokSWMy8nRVVX0d+F3gTuAc4E3AQ0n+/WI6rKr7gb+c0bwFOFZVj1bVN5s+rqiqL1XV5TO+jgNXA++qqp8AZl0n2wTdwSQHn3rqqcWUKElahFGvkbwxye8BnwFeAGypqsuAHwF+aQx1nAs8PrQ93bTN5X8D/yHJTcBjsz2hqm6pqqmqmlq3bt0YSpQkzWbUNyReCfxGM5p4TlV9I8k1Y6gjs7TVXE+uqkNNTfMfNNkKbN28eXOL0iRJ8xl1auvJmSGS5L0AVXXfGOqYBs4b2l4PPDGG40qSOjZqkFwyS9tlY6zjAHBBkk1J1jJYJban7UG915YkdW/eIEnyjiRfAn4gyReHvv4U+OJSOkxyB/AAcGGS6SQ7quoEcB2wHzgC3FVVh5dy/Bl9efdfSerYQtdIPg58CtgNDL+345mqmrnyaiRVtX2O9n3AvqUcc56+9gJ7p6amrh3ncS
VJz1toaquq6jHgncAzQ18keWm3pbXniESSurdQkHy8+f4gcLD5/uDQ9ormNRJJ6t68U1tVdXnzfdPylCNJOt2M+obEH0vyoubxW5K8P8mGbktrz6ktSereqMt/fwv4RpIfAX4Z+Arwsc6qGhOntiSpe6MGyYmqKuAK4ANV9QFgKTdslCT1zKi3SHkmyfXAW4DXNXfrfUF3ZY2Ht0h5np8LL6kro45Ifhb4O2BHVf05gxsq/npnVY2JU1uS1L2RRiRNeLx/aPv/Abd3VZQk6fQx6qqtNyd5JMnTSb6e5JkkX++6OEnSyjfqNZL3AVur6kiXxYxbX6+ReL1D0koy6jWSvzjdQgS8RiJJy2HUEcnBJJ8A/heDi+4AVNU9nVQlSTptjBok3w18A3jDUFsBBokkrXKjrtq6uutCJEmnp1FXbX1/kvuSHGq2fzjJf+m2tPa815YkdW/Ui+2/DVwPPAtQVV9k8HG4K5oX2yWpe6MGyXdW1edntJ0YdzGSpNPPqEHy1STfx+ACO0muBJ7srCpJ0mlj1FVb7wRuAX4gyZ8Bfwpc1VlVmjjf9ChpVPMGSZJfHNrcB3yWwSjmb4GfYej+W5Kk1WmhEcnJzxy5EPhnwO8DAf4NcH+Hdc0ryUXAu4GvAfdV1d2TqkWSVrt5r5FU1Xuq6j3Ay4BXVdUvVdV/Ai4G1i+lwyS3Jjl+cinxUPulSY4mOZZk1wKHuQz4UFW9A3jrUuqQJI3HqNdINgDfHNr+JrBxiX3eBtzA0G3omw/KuhG4BJgGDiTZA6wBds/Y/xoGH/P7riRvBM5eYh2SpDEYNUg+Bnw+ye8xWLn1JuCjS+mwqu5PsnFG8xbgWFU9CpDkTuCKqtoNXD7Hod7ZBJC3aZGkCRr1Fim/muRTwGubpqur6uEx1nEu8PjQ9jTwo3M9uQmi/wy8iDk+qTHJTmAnwIYNG8ZUpiRpplFHJFTVQ8BDHdWR2bqcp5bHaEJinufckuRJYOvatWsvbleexs3lxVJ/jBwkHZsGzhvaXg880fagVbUX2Ds1NXVt22NpfgaDtHqN+s72rh0ALkiyKclaBvfx2tP2oN60UZK6t+xBkuQO4AHgwiTTSXZU1QngOmA/cAS4q6oOt+3LmzZKUveWfWqrqrbP0b6Pwbvnx6avn9kuSSvJSpna6oQjEknq3kq52N4JRyT94gV9aWVyRCJJaqXXQeKqLUnqXq+DxBGJJHWv10EiSeper4PEqS1J6l6vg8SpLUnqXq+DRJLUPYNEktRKr4PEaySS1L1eB4nXSCSpe70OEklS9wwSSVIrBokkqRWDRJLUSq+DxFVbktS9Xn8eSVXtBfZOTU1dO+latPL5eScrg/8dTj+9HpFIkrpnkEiSWjFIJEmtGCSSpFZWfJAkOT/Jh5PcPdT2oiQfTfLbSa6aZH2StNp1GiRJbk1yPMmhGe2XJjma5FiSXfMdo6oeraodM5rfDNxdVdcCbxxz2ZKkReh6+e9twA3A7ScbkqwBbgQuAaaBA0n2AGuA3TP2v6aqjs9y3PXAl5rH3xpzzZKkReg0SKrq/iQbZzRvAY5V1aMASe4Erqiq3cDlIx56mkGYfIE5RlVJdgI7ATZs2LDo2iVJo5nENZJzgceHtqebtlklOTvJTcArk1zfNN8D/EyS3wL2zrZfVd1SVVNVNbVu3boxlS5JmmkS72zPLG0115Or6mvA22e0/S1w9YIdJVuBrZs3b15sjZKkEU0iSKaB84a21wNPTKAOST3hbVUmaxJTWweAC5JsSrIW2Abs6aIjPyFRkrrX9fLfO4AHgAuTTCfZUVUngOuA/cAR4K6qOtxR/979V5I61vWqre1ztO8D9nXZd9OPd/+VpI6t+He2t+GIRJK65+eRqLe8ACstj16PSCRJ3ev1iMT3kUgaB0e38+v1iMTlv5LUvV4HiSSpe70OEldtSVL3eh0kTm1JUvd6HSSSpO71Okic2pKk7vU6SJzakqTu9fp9JFLXZr6/wPcWaD
UySKQVzDfC6XTQ66ktSVL3DBJJUiu9DhJXbUlS93odJK7akqTu9TpIJEndM0gkSa0YJJKkVgwSSVIrKz5Ikpyf5MNJ7p6vTZI0GZ0GSZJbkxxPcmhG+6VJjiY5lmTXfMeoqkerasdCbZKkyej6Fim3ATcAt59sSLIGuBG4BJgGDiTZA6wBds/Y/5qqOt5xjZKkFjoNkqq6P8nGGc1bgGNV9ShAkjuBK6pqN3D5uPpOshPYCbBhw4ZxHVaSNMMkrpGcCzw+tD3dtM0qydlJbgJemeT6udpmqqpbqmqqqqbWrVs3xvIlScMmcfffzNJWcz25qr4GvH2htlk7SrYCWzdv3rzYGiVJI5pEkEwD5w1trweemEAdkvBW9WpvElNbB4ALkmxKshbYBuzpoiPvtSVJ3et6+e8dwAPAhUmmk+yoqhPAdcB+4AhwV1Ud7qh/7/4rSR3retXW9jna9wH7uuy76WcvsHdqaurarvuSpNVqxb+zvQ1HJJLUvV4HiddIJKl7k1i1Ja1arpDSXE7nfxu9HpE4tSVJ3et1kDi1JUnd63WQSJK61+sgcWpLkrrX6yBxakuSutfrIJEkdc8gkSS10usg8RqJJHWv129I9F5bkvpqJb2BsdcjEklS9wwSSVIrBokkqZVeB4kX2yWpe70OEt+QKEnd63WQSJK6Z5BIkloxSCRJrRgkkqRWUlWTrqFzSZ4CvrKEXV8GfHXM5ZzuPCen8nycyvNxqtP9fHxvVa1b6EmrIkiWKsnBqpqadB0riefkVJ6PU3k+TrVazodTW5KkVgwSSVIrBsn8bpl0ASuQ5+RUno9TeT5OtSrOh9dIJEmtOCKRJLVikEiSWjFI5pDk0iRHkxxLsmvS9SxFkluTHE9yaKjtpUk+neSR5vtLmvYk+WDzer+Y5FVD+7ytef4jSd421H5xki81+3wwSZbax3JIcl6SzyY5kuRwkv+4ms9JkjOTfD7JHzXn4z1N+6Ykn2tq/USStU37C5vtY83PNw4d6/qm/WiSnxxqn/X3aCl9LJcka5I8nOTepdbap/Mxkqrya8YXsAb4E+B8YC3wR8BFk65rCa/jdcCrgENDbe8DdjWPdwHvbR7/FPApIMCrgc817S8FHm2+v6R5/JLmZ58HXtPs8yngsqX0sYzn4xzgVc3jFwNfBi5areek6fO7mscvAD7X1HAXsK1pvwl4R/P454GbmsfbgE80jy9qfkdeCGxqfnfWzPd7tNg+lvnfyS8CHwfuXUqtfTsfI52zSRewEr+a/xHsH9q+Hrh+0nUt8bVs5NQgOQqc0zw+BzjaPL4Z2D7zecB24Oah9pubtnOAPx5qf+55i+1jgufm94FLPCcF8J3AQ8CPMngn9hlN+3O/C8B+4DXN4zOa52Xm78fJ5831e9Tss6g+lvE8rAfuA34CuHcptfbpfIz65dTW7M4FHh/anm7a+uAVVfUkQPP95U37XK95vvbpWdqX0seya6YIXsngr/BVe06aaZwvAMeBTzP4i/mvq+rELPU8V2vz86eBs1n8eTp7CX0sl98Efhn4+2Z7KbX26XyMxCCZXWZp6/s66ble82Lbl9LHskryXcDvAr9QVV+f76mztPXqnFTVt6rqnzL4S3wL8IPz1DOu8zHfa57Y+UhyOXC8qh4cbp6nnl6fj8UwSGY3DZw3tL0eeGJCtYzbXyQ5B6D5frxpn+s1z9e+fpb2pfSxbJK8gEGI/E5V3dM0r+pzAlBVfw38AYNrJN+T5IxZ6nmu1ubnZwF/yeLP01eX0Mdy+DHgjUkeA+5kML31m0uotS/nY2QGyewOABc0KynWMrjItWfCNY3LHuDkKqO3MbhOcLL9rc0qolcDTzdTMPuBNyR5SbPS6A0M5m+fBJ5J8upmZdJbZxxrMX0si6bODwNHqur9Qz9aleckybok39M8/gfAvwKOAJ8Frpyj1pOv4UrgMzWYvN8DbGtWGG0CLmCw6GDW36Nmn8X20bmqur6q1lfVxqbWz1TVVU
uotRfnY1EmfZFmpX4xWE3zZQZzxr8y6XqW+BruAJ4EnmXwl80OBvOr9wGPNN9f2jw3wI3N6/0SMDV0nGuAY83X1UPtU8ChZp8beP5OCYvuY5nOx79gMC3wReALzddPrdZzAvww8HBzPg4B/7VpP5/B//iOAf8TeGHTfmazfaz5+flDx/qV5jUcpVmpNt/v0VL6WOZ/K6/n+VVbq/58LPTlLVIkSa04tSVJasUgkSS1YpBIkloxSCRJrRgkkqRWDBJJUisGiTRmSf5mzMe7LcmVCz9TmgyDRJLUikEiLSDJe5P8/ND2u5O8K8l9SR7K4IOsrphlv9ef/HCkZvuGJP+2eXxxkv+T5MEk+0/eh2uEWmbdL8kfNHV+PsmXk7y29QuXRmSQSAu7E/jZoe1/DXwEeFNVvQr4ceC/N/fXWlBz48gPAVdW1cXArcCvjmG/M6pqC/ALwLtGqUUahzMWfoq0ulXVw0lenuQfAeuAv2JwD7PfSPI6Bp9dcS7wCuDPRzjkhcAPAZ9usmdNc7y2+528m/GDDD7QTFoWBok0mrsZ3H31HzIYoVzFIFQurqpnm1uPnzljnxOcOuo/+fMAh6vqNYusYaH9/q75/i383dYycmpLGs2dDG77fSWDUDmLwYcgPZvkx4HvnWWfrwAXNbcTPwv4l037UWBdktfAYMoqyT8eoYal7id1yr9apBFU1eEkLwb+rKqeTPI7wN4kBxncjv6PZ9nn8SR3MbhN+yMMbtlOVX2zWc77wSZgzmDwAUqHF6hhSftJXfM28pKkVpzakiS14tSWtEIkuZHB54YP+0BVfWQS9UijcmpLktSKU1uSpFYMEklSKwaJJKkVg0SS1Mr/B0NHHeyvZyHKAAAAAElFTkSuQmCC\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -235,7 +235,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -279,7 +279,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -399,7 +399,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -499,7 +499,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEXCAYAAABCjVgAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xu8FWW9x/HPl3veQIUsuecdzUzwllneKiTFU8cMj5l4N0OzTKM0RczUvOQlU6lMs0RJxVA5oWJe4qSBSigSioSwQwUUNUDk9jt/zOxxsVhr7wWs2Qv2/r5fr/Xaa555ZuY3s2bPb80zM89SRGBmZgbQqtYBmJnZhsNJwczMMk4KZmaWcVIwM7OMk4KZmWWcFMzMLOOkYBlJvSSFpDZVmNeBkuoaGH+zpB+v73IsH5IWSfpEreOoFkmDJf211nFsDJwUmlB6wN2+qGyYpN+v4/wel3TyesQzS9Kh6zr9+oiI0yPiklosu1i6HZdK6l5QdqikWUX1Bkt6QdISSW9IuklSp4Lxw9LP+GsFZW3Ssl5llr1aIpbUTdK9khZIejdd3uCC+u0lXSZptqT3Jb0i6VxJWtv1aUhEbBYRMyutv74kXS1pXFHZtZIeXId5Ve3LTUvkpGAtiqTWZUYtBsqeuUg6B7gCOBfoCOwL9AQekdSuoOrbwPAGltOYO4A56by3Br4JvFkw/o/AIcAAYHPgOOBU4Lq1WZ8N0I+B7SSdACBpP+B44PSaRlVCs082EeFXE72AALYvKhsG/D59fyBQB/wIWADMAo4tM69LgZXAUmAR8Iu0/DPARODd9O9nykx/B7AKeD+d/jygVxrj8cDsNIbzC6ZpBQwFXgXeAkYBW5WZf4PrAtwG/KRg+DzgdWAucHLhtiI5OD4AvJeu00+AvxZMuzPwCMkBeTpwdNFybgLGkhwoDy0R6+PARcB/CpZ5KDArfb9Fuo2OLppuM2AecGLBZ/kH4B/A8WlZm3RdepXZTvXbvE06vAjYo0zdQ9LPu3tR+T7pvrB9JeuztvsqSQJ6KZ3fv4HvF9Q7BZiRbvsxwLZF8zgdeAVYCNwIqIFlHpjuV72AacBpDdRtBVwAvJZ+Br8DOqbjZqfLXpS+9gMGA38Frkpj+RdwWMH8OgK/IdkH/53uY63TcYOBCcDP0/X8SaXbcWN81TyAlvSisqSwArgGaA98nuRAtlOZ+T0OnFwwvFW6wx+XHoyOSYe3LjP9LAoOkgUHqF8BHwE+BXwA7JKOPxt4GuiWxncLMLLMvBtcFwqSAtAfeAPYFdiEJGEVHpTuSl+bAH1Ivkn/NR23aTp8QrrOe5IkoV0LlvMusH96IOlQbjumsdZ/FoVJoX+6Lm1KTHt7/Tao/yyBgcBMoC1rnxQeTQ9Ag4AeRXUvB54oM5/XSA+ija1POvxL4JeV7KskB8oD0vdbAnum7w9Ot/We6Wd8A/Bk0TweBDoBPYD5QP9G/kduSef5OA0nkBNJktEnSJLzfcAdpbZpWjYYWE6SxFoD3yL5AqJ0/P3psjcFPgr8vWB7Dk4//zPTz/MjtT6W5Ply89GG6ccR8UFEPAE8BBxd4XRfBl6JiDsiYkVEjAT+CRyxlsu/OCLej4h/kHzr/VRafhrJmUNdRHxAchA8qpHT6UrW5WjgtxExNSKWABfXj0ibYf4buCgilkTESyQH4nqHkxzsfpuu83PAvcBRBXX+FBETImJVRCxtINbLgCMk7VpU3hlYEBErSkzzejo+ExFjSA6A63K952vAUyTNKf+SNFnSXgVxvF5mujXioPz6EBFnRMQZFca0HOgjaYuIWJhuY4BjgVsj4rl0f/ghsF/R9ZPLI+KdiJgN/AXYo5FlPUVyZviHSI/IZRwLXBMRMyNiUbrsQY3si69FxK8iYiXJPvRxYBtJ
2wCHAWdHxOKImEdyVjCoYNq5EXFDuo+938g6bNScFJrWSpJvj4XakvzT1VsYEYsLhl8Dtq1w/tum9Qu9BnRdmyBJvrXXW0LyTQySdu7Rkt6R9A7JKf5KYJsy86l0XbYl+bZfr/B9F5JvZ+XG9wT2qY8pjetY4GNl6pcVEfOBXwDDi0YtADqXOeB8PB1f7ALgfKBDfYGkHuldPYskLSoTw8KIGBoRu5Js18nA/emF5AXp8kpZI44G1mdt/TdJE9Jrkp5I2/uhaH9LD85vsfr+Vm5fWoOkrUmad64luS7TqVzd4mWn79tQfl9cLZb0ywdpPD1J/g9fL9iHbiE5Y6hX0T7UHDgpNK3ZJKe2hXqz+s69paRNC4Z7kJzmllL8TWouyQ5eqAdJG2kl0zdmDkk7bKeCV4eIKDf/StfldZImqXrdC97PJzl1Lzd+DkmTSmFMm0XEtwrqrM16XgkcBPQtKPsbSTPaVwsrput2GDC+eCYR8QhJ88YZBWWz09g2i4iyB8eC+gtIDpLbkjQNPkqSAAvXH0l7k2yTxypcn7USERMj4kiSg+T9JNeSoGh/S7fH1pTf3xpzLfDniPgu8CTJupdTvK/3INlP3mTd9usPgM4F+9AWaWKu12K6k3ZSaFp3Axektx22Sm8HPQK4p6jexZLaSTqApHnkj2Xm9yZJm2q9scCOkv4nvRXy6yRt8OVu6yuevjE3A5dK6gkgqYukIxuZppJ1GQWcIGkXSZsAF9aPSE/17wOGSdpE0s4kd+TUe5BknY+T1DZ97SVpl7VYr0xEvANcTXLhu77sXZImrRsk9U+X0StdlzqSayClnF84n0pIukLSbunntzlJ2/eMiHgrIh4lSUD3StpVUmtJ+5Jc3L4pIl6pZH3WMp52ko6V1DEilpNc7F+Zjr6T5HPbQ1J74KfAMxExax2WMwD4AvC9tOhM4L8kHVRmkpHAdyX1lrRZuuy70ya++SQ3UVS0b0fE68DDwNWStkj/N7eT9Pm1XY/mwEmhaQ0H/o/kLoiFwM9I7sh5saDOG+m4uST/7KdHxD/LzO86kjb9hZKuj4i3SA6855Ccxp8HHJ5+4yzlMpIk9Y6k71cQ/3Ukd5g8LOk/JBed92mgfkXrEhH/C1xP0uY8g+SbOSTf3gCGkNwd8gbJAXhk/biI+A/wRZL237lpnStILnyuq+v48MBXH+PPSO6kuorkwPgMyTfMQ9L29DVExASSC5aVqP8mugkwGniH5GJ1T5IL1/X+m2Q7/Znkzprfk9w1c+barI+ShwdvrjC244BZkt4juZvoGwARMZ7k2se9JGd727F6O3xF0uR3M3BWRLydznseyX78K0kfKTHZrST7wpMkdxItJd0GadPQpcCEdN/et4Iwvgm0I7nLaiHJF7VyTXXNWv2Vd9sASDqQ5G6Rbo3Vbc7Sb/kvAu1LXdyVdAXwsYg4vsmDqzJJu5PcsdNQ+3mTktSKJIn0TC8QWwviMwXbIEj6StpUsSXJN/0H6hOCpJ0l7a7E3sBJJN+mN2rpwfdoYFKtYymyG8k37zcaq2jNT25JQdKtkuZJerHMeEm6XtIMSVMk7ZlXLLZROI2kLfhVkm+phReKNye5rrCY5PrD1cCfmjrAHMwmeQbiu7UOpJ6k+uapH0TEslrHY00vt+YjSZ8jafP8XUTsVmL8AJI2wAEk7dLXRURD7dNmZpaz3M4UIuJJkkfCyzmSJGFERDwNdJLUIi/smJltKGrZsVNXVn8gpC4tW+OJTUmnknT6xaabbtp35513Xrclzn1+3aZbH9t+urbLrsVya7nsdHu/8O93m3zRn+zasWbLBWq2vVviOtd82evg2WefXRARXRqrV8ukoBJlJduyImIEMAKgX79+MWnSOl6XG9Zx3aZbH8OSWHsNfajJFz3r8i/XZLm1XPasy78M1GZ7T6rROk+q4TrX6nOuX+eW+D+9riQV93ZQUi3vPqpj9SdTu1H+yV0zM2sCtUwKY4Bvpnch7Qu8mz5Z
aGZmNZJb85GkkSTdJ3dW8rOMF5F2BhcRN5N0yTCA5AnWJSRdH5uZWQ3llhQi4phGxgfw7Wosa/ny5dTV1bF0aUO9IgNfGtXw+DxMmwbArwZW/8aqIHjtneXc8MxC3vtgVdXnb2YtT7P4Wbm6ujo233xzevXqhVTq+nVqbiNJIw/bJv2yLa97p+qzjgi23vo9zgQuffKtqs/fzFqeZtHNxdKlS9l6660bTgjNkCTabLIFPTsV/0SDmdm6aRZJAWhxCaGeJFTy7l4zs7XXLJqPzKxl6rX0ziZf5qwmX2LTapZJodoPlcw6q7Jfw9xsh/15b/pTnH3RVTw2YSKS6LBZJ0aNGgVtt+Q/773L5Rf+gMkTnwFgj732YejwK9h8i478e85sBnzmU/xg+BX8zwmnAvDTC85l190/zZFH/09V18fMrJxm03y0obh7zMPMfWM+Ux69mxfGj2L06NF06pR0lT/s3LPo1qMXD014nocmPE/X7j25+LzvZNNu1bkLd/7mZpYvc+eUZlYbTgpV9vqbC/j4Np1p1SrZtN26dWPLLbdk9r9m8tILkzn1O+dmdU87+zymTnmeObP+BcCWW23N3p/9HGPuGVmT2M3MnBSq7OgjvsADjzzJHl8YxDkXX8PzzyedZs185Z/s1OeTtG7dOqvbunVrdurzSV59eVpWduIZ3+V3I25k5cqVa8zbzCxvTgpV1m3bbZj+5Ggu++GZtGolDjnkEMaPH09EmTukkhEfTt+jJ7vtsSdj7y/1+/ZmZvlqlheaa619+3YcdvD+HHbw/myz3e7cf//9fOnoE/jn1CmsWrUqa1patWoV06e9yCe232m16U8e8j3OOe14+u7zmVqEb2YtmM8Uquy5F6Yx9435QHLQnzJlCj179qRH70+w8667M+L6q7K6I66/il12+xQ9en9itXn03n5HtttxZ54cP65JYzcza5ZnCmX7HM/xRzFWrFhB+3btmLfgbU459xI+WLYcgL0/8zmGDBnCywuWcvGVN3DZhedx+Gf3JCLYve9eDLvyhpLzO/nM7/H1/p/PLV4zs1KaZVKohanTZ7Jdr270P2h/+h+0/4cjsl9KWsoWnTpx2fUjSk7ftXsP7hv/t2x4pz6fZPLshn7N1Mys+pwUquDm393D9beO5NqLv1/rUMzM1ouTQhWc/s2jOP2bR9U6DDOz9eYLzWZmlnFSMDOzjJOCmZllnBTMzCzTPC80D+tY3fmd+nhF1ermvsm3z7+cl16eyaoIDj/0AK785W289NJLTJjyMgcc/EUAbrrmcjbZZFOOP/3M6sZpZraefKZQJRHBV0/5Pv/V/0BemfAnXn5qNIsWL+H8889n8uTJPPXYI1VbljvLM7O8NM8zhRp47K9/p0P7dpzw9SOBpAfUnw87h577DqRt27asWLmKyROf5sRvfxeAV1+ZzklfO5zX59Zx7Enf4tgTTwPgwfvu5s5bR7Bi+TJ2+3Rfzr/0alq3bs2+O3XjuFPO4P+eeIxzfnwJe+69X83W1cyaL58pVMnUl2fS95O7rFa2xeab0atXLy644AK+eMRXGDXuKfoP/CoAs159mZt+fy9/eGA8t/z8CpYvX87MV6Yz7oHR3D76z4wa9xStW7Vm7Oikt9T3lyxm+5124Q8PPOqEYGa58ZlClUREya6xy5UfcPAXade+Pe3at2erzl14e8E8npnwBNOm/INjDz8YgKVLl7JV5y5AcuZx6ICB+a6EmbV4TgpVsuuO23Hv2PGrlb33n0XMmTNntR/WqdeuXfvsfatWrVixYiURcMTXBvGdoRetWb99h5LzMTOrJjcfVckhB+zNkveX8rs/PggkF4PPGf5zBg8ezDbbbMOSxYsancc++3+ORx8aw1sLkq633124kLl1s3ON28ysUPM8Uxj2bunyHLvOlsToX1/NGT+6jEuu/RWrIhhw8P789Kc/ZfHixVw0/FKO/tIB2YXmUrbbcWe+fe75fOvYr7Jq1SratG3Lj35yJdt2
65Fb3GZmhZpnUqiR7l0/xgO3X7d6Yfv2tG/fnjsfeqzsdIVdZvcf+NXsYnShp6fXVS1OM7Ny3HxkZmYZJwUzM8s0m6QQEbUOoSYigqBlrruZVV+zSAodOnTgrbfeanGJISJYseQ9Xntnea1DMbNmollcaO7WrRt1dXXMnz+/4YrvzGuagAq9Ow2ANxe+X/VZB8Fr7yznhmcWVn3eZtYyNYuk0LZtW3r37t14xWH75h/MGstMbo89bOhDTb9sM7O11Cyaj8zMrDpyTQqS+kuaLmmGpKElxveQ9BdJz0uaImlAnvGYmVnDcksKkloDNwKHAX2AYyT1Kap2ATAqIj4NDAJ+mVc8ZmbWuDzPFPYGZkTEzIhYBtwFHFlUJ4At0vcdgbk5xmNmZo3IMyl0BeYUDNelZYWGAd+QVAeMBUr+PqWkUyVNkjSp0TuMzMxsneWZFNb8EQHWeMrqGOC2iOgGDADukLRGTBExIiL6RUS/Ll265BCqmZlBvkmhDuheMNyNNZuHTgJGAUTE34AOQOccYzIzswbkmRQmAjtI6i2pHcmF5DFFdWYDhwBI2oUkKbh9yMysRnJLChGxAhgCjAOmkdxlNFXScEn1vyt5DnCKpH8AI4HB0dL6qjAz24Dk+kRzRIwluYBcWHZhwfuXgP3zjMHMzCrnJ5rNzCzjpGBmZhknBTMzyzgpmJlZxknBzMwyTgpmZpZxUjAzs4yTgpmZZZwUzMws46RgZmYZJwUzM8s4KZiZWcZJwczMMk4KZmaWcVIwM7OMk4KZmWWcFMzMLOOkYGZmGScFMzPLOCmYmVnGScHMzDJOCmZmlnFSMDOzjJOCmZllnBTMzCzjpGBmZhknBTMzyzgpmJlZxknBzMwyTgpmZpZxUjAzs4yTgpmZZZwUzMws46RgZmaZXJOCpP6SpkuaIWlomTpHS3pJ0lRJd+YZj5mZNaxNXjOW1Bq4EfgCUAdMlDQmIl4qqLMD8ENg/4hYKOmjecVjZmaNy/NMYW9gRkTMjIhlwF3AkUV1TgFujIiFABExL8d4zMysEXkmha7AnILhurSs0I7AjpImSHpaUv9SM5J0qqRJkibNnz8/p3DNzCzPpKASZVE03AbYATgQOAb4taROa0wUMSIi+kVEvy5dulQ9UDMzS+SZFOqA7gXD3YC5Jer8KSKWR8S/gOkkScLMzGogz6QwEdhBUm9J7YBBwJiiOvcDBwFI6kzSnDQzx5jMzKwBuSWFiFgBDAHGAdOAURExVdJwSQPTauOAtyS9BPwFODci3sorJjMza1hut6QCRMRYYGxR2YUF7wP4XvoyM7Ma8xPNZmaWcVIwM7OMk4KZmWUqSgqSviZp8/T9BZLuk7RnvqGZmVlTq/RM4ccR8R9JnwW+BNwO3JRfWGZmVguVJoWV6d8vAzdFxJ+AdvmEZGZmtVJpUvi3pFuAo4GxktqvxbRmZraRqPTAfjTJg2b9I+IdYCvg3NyiMjOzmqgoKUTEEmAe8Nm0aAXwSl5BmZlZbVR699FFwA9IfhAHoC3w+7yCMjOz2qi0+egrwEBgMUBEzAU2zysoMzOrjUqTwrK0n6IAkLRpfiGZmVmtVJoURqV3H3WSdArwKPCr/MIyM7NaqKiX1Ii4StIXgPeAnYALI+KRXCMzM7Mm12hSkNQaGBcRhwJOBGZmzVijzUcRsRJYIqljE8RjZmY1VOmP7CwFXpD0COkdSAARcVYuUZmZWU1UmhQeSl9mZtaMVXqh+XZJ7YAd06LpEbE8v7DMzKwWKkoKkg4k6S57FiCgu6TjI+LJ/EIzM7OmVmnz0dXAFyNiOoCkHYGRQN+8AjMzs6ZX6cNrbesTAkBEvEzS/5GZmTUjlZ4pTJL0G+COdPhY4Nl8QjIzs1qpNCl8C/g2cBbJNYUngV/mFZSZmdVGpUmhDXBdRFwD2VPO7XOLyszMaqLSawrjgY8UDH+EpFM8MzNrRipNCh0iYlH9QPp+k3xCMjOzWqk0KSyWtGf9gKR+wPv5
hGRmZrVS6TWFs4E/SppL8kM72wJfzy0qMzOriQbPFCTtJeljETER2Bm4G1gB/Bn4VxPEZ2ZmTaix5qNbgGXp+/2AHwE3AguBETnGZWZmNdBY81HriHg7ff91YERE3AvcK2lyvqGZmVlTa+xMobWk+sRxCPBYwbhKr0eYmdlGorED+0jgCUkLSO42egpA0vbAuznHZmZmTazBpBARl0oaD3wceDgiIh3VCjgz7+DMzKxpVfIbzU9HxOiIKPwZzpcj4rnGppXUX9J0STMkDW2g3lGSIn3+wczMaqTSh9fWWto/0o3AYUAf4BhJfUrU25yko71n8orFzMwqk1tSAPYGZkTEzIhYBtwFHFmi3iXAz4ClOcZiZmYVyDMpdAXmFAzXpWUZSZ8GukfEgw3NSNKpkiZJmjR//vzqR2pmZkC+SUElyiIbKbUCfg6c09iMImJERPSLiH5dunSpYohmZlYoz6RQB3QvGO4GzC0Y3hzYDXhc0ixgX2CMLzabmdVOnklhIrCDpN6S2gGDgDH1IyPi3YjoHBG9IqIX8DQwMCIm5RiTmZk1ILekEBErgCHAOGAaMCoipkoaLmlgXss1M7N1l2tXFRExFhhbVHZhmboH5hmLmZk1Ls/mIzMz28g4KZiZWcZJwczMMk4KZmaWcVIwM7OMk4KZmWWcFMzMLOOkYGZmGScFMzPLOCmYmVnGScHMzDJOCmZmlnFSMDOzjJOCmZllnBTMzCzjpGBmZhknBTMzyzgpmJlZxknBzMwyTgpmZpZxUjAzs4yTgpmZZZwUzMws46RgZmYZJwUzM8s4KZiZWcZJwczMMk4KZmaWcVIwM7OMk4KZmWWcFMzMLOOkYGZmGScFMzPLOCmYmVkm16Qgqb+k6ZJmSBpaYvz3JL0kaYqk8ZJ65hmPmZk1LLekIKk1cCNwGNAHOEZSn6JqzwP9ImJ34B7gZ3nFY2ZmjcvzTGFvYEZEzIyIZcBdwJGFFSLiLxGxJB18GuiWYzxmZtaIPJNCV2BOwXBdWlbOScD/lhoh6VRJkyRNmj9/fhVDNDOzQnkmBZUoi5IVpW8A/YArS42PiBER0S8i+nXp0qWKIZqZWaE2Oc67DuheMNwNmFtcSdKhwPnA5yPigxzjMTOzRuR5pjAR2EFSb0ntgEHAmMIKkj4N3AIMjIh5OcZiZmYVyC0pRMQKYAgwDpgGjIqIqZKGSxqYVrsS2Az4o6TJksaUmZ2ZmTWBPJuPiIixwNiisgsL3h+a5/LNzGzt+IlmMzPLOCmYmVnGScHMzDJOCmZmlnFSMDOzjJOCmZllnBTMzCzjpGBmZhknBTMzyzgpmJlZxknBzMwyTgpmZpZxUjAzs4yTgpmZZZwUzMws46RgZmYZJwUzM8s4KZiZWcZJwczMMk4KZmaWaVPrAJpSr6V3NvkyZzX5Es3M1p3PFMzMLOOkYGZmGScFMzPLOCmYmVnGScHMzDJOCmZmlnFSMDOzjJOCmZllnBTMzCzjpGBmZhknBTMzyzgpmJlZxknBzMwyTgpmZpbJNSlI6i9puqQZkoaWGN9e0t3p+Gck9cozHjMza1huSUFSa+BG4DCgD3CMpD5F1U4CFkbE9sDPgSvyisfMzBqX55nC3sCMiJgZEcuAu4Aji+ocCdyevr8HOESScozJzMwaoIjIZ8bSUUD/iDg5HT4O2CcihhTUeTGtU5cOv5rWWVA0r1OBU9PBnYDpuQTdsM7AgkZrWbV4ezcdb+umVavt3TMiujRWKc+f4yz1jb84A1VSh4gYAYyoRlDrStKkiOhXyxhaEm/vpuNt3bQ29O2dZ/NRHdC9YLgbMLdcHUltgI7A2znGZGZmDcgzKUwEdpDUW1I7YBAwpqjOGOD49P1RwGORV3uWmZk1Krfmo4hYIWkIMA5oDdwaEVMlDQcmRcQY4DfAHZJmkJwhDMorniqoafNVC+Tt3XS8rZvWBr29c7vQbGZmGx8/0WxmZhknBTMzy7SopCDpVknz0ucj6su2kvSI
pFfSv1sWTbOXpJXpcxeF5VtI+rekX5RYzpjCZbRkklpLel7Sg+nwkLRbk5DUuaDeuZImp68X022+laSdCsonS3pP0tkF052ZdqUyVdLParGOG4Jy20nSJZKmpGUPS9o2rb+zpL9J+kDS94vmVbJ7GkmHSHounddfJW3f1OtZS7U6fjS2jGprUUkBuA3oX1Q2FBgfETsA49NhIOuq4wqSi+XFLgGeKC6U9FVgUZXibQ6+A0wrGJ4AHAq8VlgpIq6MiD0iYg/gh8ATEfF2REwvKO8LLAFGA0g6iOSp+N0jYlfgqvxXZ8PUwHa6MiJ2T8sfBC5MJ3kbOIuibdZI9zQ3Acem87oTuCDn1drQ3EZtjh9ll5GHFpUUIuJJ1nwOorCrjduB/yoYdyZwLzCvcAJJfYFtgIeLyjcDvgf8pHpRb7wkdQO+DPy6viwino+IWY1MegwwskT5IcCrEVGfUL4FXB4RH6TznldimpYo204R8V5B+aakD4dGxLyImAgsL5q2oe5pAtgifd+RNZ87atZqePxoaBlV16KSQhnbRMTrAOnfjwJI6gp8Bbi5sLKkVsDVwLkl5nVJOm5JngFvRK4FzgNWVTqBpE1Ivo3dW2L0IFZPFjsCB6Q97D4haa/1CbYZWW07SbpU0hzgWD48UyinKzCnYLguLQM4GRgrqQ44Dri8ahFvvJri+FFyGXlxUijvWuAHEbGyqPwMYGxEFP7jIGkPYPuIGN1UAW7IJB0OzIuIZ9dy0iOACRGx2jey9AHIgcAfC4rbAFsC+5L8k42SWnaHiqW2U0ScHxHdgT8AQ8pNWz+LEmX1961/FxgQEd2A3wLXrH/EzdZGe/zIs++jjcWbkj4eEa9L+jgfnur1A+5KjzGdgQGSVgD7kXw7PQPYDGgnaRFJG3lfSbNItutHJT0eEQc27epsMPYHBkoaAHQAtpD0+4j4RiPTFZ8N1DsMeC4i3iwoqwPuS5+C/7ukVSSf1fz1D3+jVWo71bsTeAi4qIHpS3ZPI6kL8KmIeCYtvxv4cxXi3dg1xfGj3DLyEREt6gX0Al4sGL4SGJq+Hwr8rMQ0twFHlSgfDPyisWW09BdwIPBgUdksoHNRWX3fV5uWmMddwAlFZacDw9P3O5I0e6jW61vjbb3adgJ2KHh/JnBPUf1hwPcLhtsAM4HeQDvgH8CuafkCYMe03knAvbUBmbNkAAACjElEQVRe3xps3yY/flSyjGq+WtSZgqSRJAeozmm76EUk7aKjJJ0EzAa+VrsImz9JZ5FcZ/gYMEXS2Ei7Vydpg304IhYXTbMJ8AXgtKLZ3Qrcmt6+tww4PtL/nJaozHa6XNJOJNd1XiNJpEj6GDCJ5MLxqvQ23z4R8Z5KdE+TTnMKcG96RrYQOLFp1mzDUMPjR5Meo9zNhZmZZXyh2czMMk4KZmaWcVIwM7OMk4KZmWWcFMzMLOOkYGZmGScFazEkPS7pS0VlZ0v6ZQPTVLXHW0m3FXejbLYhcVKwlmQka/4OeLluNcxaJCcFa0nuAQ6X1B5AUi9gW2CypPHpD8i8IOnI4gklHaj0h4LS4V9IGpy+75v20vqspHFp/zSNKjddekZzhaS/S3pZ0gHru+JmlXJSsBYjIt4C/s6HP5QyiKRjt/eBr0TEnsBBwNWV9rYqqS1wA0nfNn1Jut64tArTtYmIvYGzabgDO7OqalF9H5nxYRPSn9K/J5J0F/1TSZ8j6SOoK8mPoLxRwfx2AnYDHknzSGvg9SpMd1/691mSDtLMmoSTgrU09wPXSNoT+EhEPJc2A3UB+kbE8rT74g5F061g9TPr+vECpkbEfmsZR2PTfZD+XYn/T60JufnIWpSIWAQ8TtJcU3+BuSPJDwItT3/3uWeJSV8D+khqL6kjyU9eAkwHukjaD5JmIUm7VhDKuk5nlit/A7GWaCRJ80z9nUh/AB6QNAmYDPyzeIKImCNpFDAFeAV4Pi1flt5ien2aLNqQ/OrW1IYCWNfpzPLmrrPNzCzj
5iMzM8u4+cgsB5JuJPmd6kLXRcRvaxGPWaXcfGRmZhk3H5mZWcZJwczMMk4KZmaWcVIwM7PM/wPzMzXO68HucAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -546,7 +546,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -594,7 +594,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEXCAYAAABCjVgAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAHo1JREFUeJzt3XmclXXd//HXm2FzSw1GEwYcVDBRH7kQ1g11U7ggJrRYapZZKm1g/NzSNDXNu6zM7XbDpVxR1FRUErdcslsF3IFQRJSRZFNRRGT7/P64rrk8DrMchnPNcWbez8fjPObaz+dcM3O9z/W9zvleigjMzMwAOpS7ADMz++RwKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYE2SNERSTcH4XEl7t3AND0s6qiWfs9wk/V3SD8pdR6lIqpYUkjqWuxZrmEOhnUgP5B9IWibpbUn3SOpV7rrqI+kMSdeXu45yi4j9I+Kalno+SQdKelPSpwumjZT0hqTNm7G9Fn/zYBvOodC+HBgRmwLbAAuAi8pcj32CRMRdwEPAeQCStgAuBX4aEUvLWVtdPtvIj0OhHYqIFcCtQP/aaZK6SPqTpNclLZB0maSNmtpWut75kuanj/MldUnnPSLpW+nw4LTpYHg6vrekZ+vZ3jDgV8DB6VnNcwWzt5X0uKT3JN0nqXvBel+Q9C9J70h6TtKQRmqeK+kESc9Lel/SVZK2Tptr3pP0gKQti9m2pB9KmpmuN0fSjwvmDZFUI+k4SQsl/UfSD5vapwXrZ01mknZI9+dSSYsl3Vyw3H9JmpLOmyLpv+ps46yG9ls9jgH2l7QfSTg8EhETG6lxhKTp6b55WNJO6fTrgN7AXenv8cSC1Q5L/84WSzqlYFsdJJ0k6RVJSyRNqD1rKWh6OlLS6yThZXmICD/awQOYC+ydDm8MXANcWzD/fGAi8GlgM+Au4HfpvCFATQPbOhN4AtgKqAT+BZxVMO+idPhXwCvAOQXzLmig1jOA6+tMezhdvx+wUTr++3ReT2AJMJzkjc4+6XhlI/viCWDrdN2FwNPA7kAXkgPO6cVsGzgA2B4Q8N/AcmCPgv22On2tndJtLAe2TOd/F3i+kd/Zw8BR6fB44JS0hq7A4HT6p4G3ge8DHYFD0/FuTe23Rp73UGAxsKihfZgu1w94P90nnYATgdlA57p/J+l4NRDAFWktnwM+BHZK549Nfy9V6e/hcmB8nXWvBTYBNir3/1RbfZS9AD9a6Bed/IMuA95JD1TzgV3TeUr/ubcvWP6LwKvp8BAaDoVXgOEF8/YD5qbDQ2sPesC9wFHAE+n4I8A3G6j1DOoPhVMLxn8G3JsO/xK4rs7yk4EfNLIvDisYvw24tGB8DHBHM7d9B/CLgv32AdCxYP5C4AtF/s4e5qNQuBYYB1TVWeb7wFN1pv0fcERT+62R5+0DrAJuaGK5XwMTCsY7AG8AQ+r+naTjtQf2qoJpTwGHpMMzgaEF87ZJ6+hYsO525f5fausPNx+1L1+PiC1I3oWNBh6R9BmSd/gbA9PSZoB3SA7ilUVsswfwWsH4a+k0SA5O/SRtDexGcmDrlTZfDAQeXc/63ywYXg5smg5vC3y7tva0/sEkB5WGLCgY/qCe8aK2LWl/SU9IeiudNxwobJ5ZEhGrG6h7fZxIEt5Ppc01P0qn193/pOM9C8Yb2m8NGUfyuxpe2BRVj489d0SsBebVee76NPZ7vL1gP88E1pCc0dWa18S2bQP5Yk07FBFrgL9JupzkAPc3kgPhzhHxxnpubj7JP/P0dLx3Oo2IWC5pGvAL4MWIWCnpX8CxwCsRsbihEtezhnkk7+aPXs/1Nmjb6bWT24DDgTsjYpWkO0gO3iUVEW8CR6fPOxh4QNKjfLT/C/UmCfX1JulI
oBfwNWAqcIWk3SNiZT2Lzwd2LVhX6bq1f0PN+T3+KCIer6eu6mZu09aTzxTaISVGAlsCM9N3eFcA50naKl2mZ3qxsSnjgVMlVaZnAKcBhR8nfYT0rCQdf7jOeH0WANWSiv37vB44UNJ+kiokdU0v8lYVuX5zt92Z5KxrEbBa0v7AviV4znVI+nbB63mb5OC4BphEcjb2XUkdJR1M8gGCu5vxHD2APwJHR8SHwGUk109OaWCVCcABkoZK6gQcR3KN4F/p/AXAdutRwmXA2ZK2TeupTP9OrQU5FNqXuyQtA94FziZpF699h/9LkouET0h6F3gA2LGIbf6W5B3l88ALJBdsf1sw/xGSC9ePNjBen1vSn0skPd1UARExDxhJcjF7Eck7zhMowd93Y9uOiPdIPq0zgeRA/V2Si/VFkXSYpOlNLwnA54En09/fRJLrFq9GxBKSd/XHkRzATwS+1shZWGMuAW6KiMcAImnYPxoYK2nnugtHxCzgeyQfbV4MHEjysefas4rfkbxheEfS8UU8/wXpa7tP0nskF533asbrsA2g9IKOmX3CpM1DV0bEteWuxdoPnymYfQJJ2pik6eXVctdi7YtDwewTJr2u8yZJU9s/y1yOtTNuPjIzs4zPFMzMLNPqvqfQvXv3qK6uLncZZmatyrRp0xZHRJNfSG11oVBdXc3UqVPLXYaZWasiqe433+vl5iMzM8s4FMzMLONQMDOzTKu7pmBm1pRVq1ZRU1PDihUryl1Ki+vatStVVVV06tSpWes7FMyszampqWGzzTajurqapPPW9iEiWLJkCTU1NfTp06dZ23DzkZm1OStWrKBbt27tKhAAJNGtW7cNOkPKLRQkXZ3el/bFBuZL0oWSZiu5V+4eedViZu1PewuEWhv6uvM8U/grMKyR+fsDfdPHKODSHGsxM7Mi5HZNISIeLbhbUn1Gktw4Pkj68N9C0jYR8Z+8ajKz9qn6pHtKur25vz+gyWU23XRT3n33XcaOHctDDz2EJLp27cqECRPo06cPS5cuZcyYMTz+eHKjuUGDBnHRRRex+eabM3fuXPr06cOFF17ImDFjABg9ejQDBgzgiCOOKOlrqaucF5p78vH7rdak09YJBUmjSM4m6N27d/Of8YzNm79us59zaXmfuxzPW87nbo/72695XftNgPk5f/Jo/jONz4+13HzZOcx/ZQbP33sNHTp0oGb+Ajb54HWY/w5HHn0Cu3x2e6597FYATv/TpRx12EHcMu4PsGA+W3X/NBf8+Q/8+MC96Ny5E7y/KN/Xkyrnheb6Gr7q7bI1IsZFxICIGFBZWcy95M3Myu8/Cxazzdbd6dAhOdRW9diaLbf4FLNffZ1pL8zk12M/uvX3af9vFFOfn8Erc5P3ypXdtmTooIFcc8tdLVpzOUOhhuQm37WqSG/4bmbWFnznwH246/5H2W2fQzjuN3/mmRf/DcCMl19lt513pKKiIlu2oqKC3XbekekvzcmmnTT6h5x7+fWsWbOmxWouZyhMBA5PP4X0BWCpryeYWVtS1WNrZj16O787eQwdOoihB/+EBx97koigvg8JRcTHmlD69O7JwN125sbb/95iNed2TUHSeGAI0F1SDXA60AkgIi4DJgHDSW4Wvxz4YV61mJmVS5cundn/q4PY/6uD2LqyG3dMfphfHHkoz7w4i7Vr12ZNS2vXruW5GS+xU9+Pf+nsV8f8iINGnciX92qZT+3ndqYQEYdGxDYR0SkiqiLiqoi4LA0EIvHziNg+InaNCPeHbWZtytMvzGT+m8kF4rVr1/L8jJfZtmobdujTm9132ZHfXnBltuxvL7iSPXb9LDv0+fiHaT67Qx/6992Oux94rEVqdjcXZtbmzT2mR+k21mP3JhdZvXo1XTp3ZuHitzj6hLP4cOUqAAbutjOjjzgYgKv+dDpjfn0OOwwaQQR8cc9duepPp9e7vVOOOZLd9zu0dK+hEQ4FM7MSmz5rDttXVzHsK4MY9pVB9S6z
5Raf4vqLzq53XnWvHrz40C3Z+Od27sfammlFBdKGciiYmZXQZdfeyoVXj+f83xxf7lKaxaFgZlZCPzn8IH5y+EHlLqPZ3EuqmZllHApmZpZxKJiZWcahYGZmGV9oNrO2b9yQ0m6vtpfWRtTMX8DPT/k9M16aw9oIvrb3l/jjqWOZ8dIc5i9YxPChg5NNnXsZm26yMcf/5PDS1thMPlMwMyuxiOCbRx/P14cN4eXH7+Slx25n2fvLOeWc/+XZ6bOY9NA/S/Zcpe4sz2cKZmYl9tA/n6Jrl8788OCRQNID6nlnHMe2ex1Ap44diQj++dSznDw66fJtxktzGHLQ0bz+xpuMPeq7HHNk8u3l62+7hwuvvomVK1ex1+67cMk1E6ioqGDTTTfl2GOPZfLkyZx77rkMHjy4ZLX7TMHMrMSmvzSHPXfd6WPTPrXZplRX9eDUXxzFwSP25dn7b+LgkfsB8O/Zc5l8w8U8dc+1/ObP41i1ahUzX57DzRPv4/E7rubZ+2+ioqKCG264AYD333+fXXbZhSeffLKkgQA+UzAzK7mka+x1+8ZuaPoBQwfTpUtnunTpzFbdt2TBord48J9PMe2FmXx++PcB+GDFh2zVpz+QnHl861vfyqV2h4KZWYnt3G97bpv04MemvfveMubNX0BFh3UbaLp06ZwNV1RUsHrNGiLgB98+kN+dPOajBdO+j7p27fqxG/SUkpuPzMxKbOiXBrL8gxVce8vdQHIx+Lgzz+OI7xzI1pXdeG/Z8qa3MXggt979AAsXvwXAW28v5bXXXsu1bvCZgpm1B6MeLt22iuipVBK3X3kuP/vV7zjr/CtYG8Hwrw7if04azfvLP+D3F/+F3fY5JLvQXJ/+/bbjtyf+jH0P/RlrYy2dOnbk4nF/Ydttty3da6mHQ8HMLAe9en6Gu665YJ3pXbp0Zsqk6xtcr7DL7INH7pddjAayQFq2bFnpCq3DzUdmZpZxKJiZWcahYGZtUBAR5S6iLDb0dTsUzKzN6bp0DkveX93ugiEiWLJkCV27dm32Nnyh2czanKqnz6GGX7Jo8+2Adb8stkGWzkx+vrOwtNtdn+duRNeuXamqqmr2UzgUzKzN6bTyHfo8cXI+G6/tIfWML+Sz/WKeO0duPjIzs4xDwczMMg4FMzPLOBTMzCzjUDAzs4xDwczMMg4FMzPLOBTMzCzjUDAzs0yuoSBpmKRZkmZLOqme+b0l/UPSM5KelzQ8z3rMzKxxuYWCpArgYmB/oD9wqKT+dRY7FZgQEbsDhwCX5FWPmZk1Lc8zhYHA7IiYExErgZuAkXWWCeBT6fDmwPwc6zEzsybkGQo9gXkF4zXptEJnAN+TVANMAsbUtyFJoyRNlTR10aJFedRqZmbkGwr19Vdbt3PzQ4G/RkQVMBy4TtI6NUXEuIgYEBEDKisrcyjVzMwg31CoAXoVjFexbvPQkcAEgIj4P6Ar0D3HmszMrBF5hsIUoK+kPpI6k1xInlhnmdeBoQCSdiIJBbcPmZmVSW6hEBGrgdHAZGAmyaeMpks6U9KIdLHjgKMlPQeMB46I9nb/PDOzT5Bc77wWEZNILiAXTjutYHgGMCjPGszMrHj+RrOZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmYZh4KZmWUcCmZmlnEomJlZxqFgZmaZXENB0jBJsyTNlnRSA8t8R9IMSdMl3ZhnPWZm1riOeW1YUgVwMbAPUANMkTQxImYULNMXOBkYFBFvS9oqr3rMzKxpeZ4pDARmR8SciFgJ3ASMrLPM0cDFEfE2QEQszLEeMzNrQp6h0BOYVzBek04r1A/oJ+lxSU9IGpZjPWZm1oTc
mo8A1TMt6nn+vsAQoAp4TNIuEfHOxzYkjQJGAfTu3bv0lZqZGZDvmUIN0KtgvAqYX88yd0bEqoh4FZhFEhIfExHjImJARAyorKzMrWAzs/auqFCQ9G1Jm6XDp0r6m6Q9mlhtCtBXUh9JnYFDgIl1lrkD+Eq63e4kzUlz1ucFmJlZ6RR7pvDriHhP0mBgP+Aa4NLGVoiI1cBoYDIwE5gQEdMlnSlpRLrYZGCJpBnAP4ATImJJc16ImZltuGKvKaxJfx4AXBoRd0o6o6mVImISMKnOtNMKhgM4Nn2YmVmZFXum8Iaky4HvAJMkdVmPdc3MrJUo9sD+HZKmnmHpJ4M+DZyQW1VmZlYWRYVCRCwHFgKD00mrgZfzKsrMzMqj2E8fnQ78kqRLCoBOwPV5FWVmZuVRbPPRN4ARwPsAETEf2CyvoszMrDyKDYWV6SeFAkDSJvmVZGZm5VJsKExIP320haSjgQeAK/Iry8zMyqGo7ylExJ8k7QO8C+wInBYR9+damZmZtbgmQyG9L8LkiNgbcBCYmbVhTTYfRcQaYLmkzVugHjMzK6Niu7lYAbwg6X7STyABRMQxuVRlZmZlUWwo3JM+zMysDSv2QvM1affX/dJJsyJiVX5lmZlZORQVCpKGkHSXPZfkjmq9JP0gIh7NrzQzM2tpxTYfnQvsGxGzACT1A8YDe+ZVmJmZtbxiv7zWqTYQACLiJZL+j8zMrA0p9kxhqqSrgOvS8cOAafmUZGZm5VJsKPwU+DlwDMk1hUeBS/IqyszMyqPYUOgIXBARf4bsW85dcqvKzMzKothrCg8CGxWMb0TSKZ6ZmbUhxYZC14hYVjuSDm+cT0lmZlYuxYbC+5L2qB2RNAD4IJ+SzMysXIq9pjAWuEXSfJIb7fQADs6tKjMzK4tGzxQkfV7SZyJiCvBZ4GZgNXAv8GoL1GdmZi2oqeajy4GV6fAXgV8BFwNvA+NyrMvMzMqgqeajioh4Kx0+GBgXEbcBt0l6Nt/SzMyspTV1plAhqTY4hgIPFcwr9nqEmZm1Ek0d2McDj0haTPJpo8cAJO0ALM25NjMza2GNhkJEnC3pQWAb4L6IiHRWB2BM3sWZmVnLarIJKCKeqGfaS/mUY2Zm5VTsl9fMzKwdcCiYmVnGoWBmZplcQ0HSMEmzJM2WdFIjyx0kKdI+lczMrExyC4X0ngsXA/sD/YFDJfWvZ7nNSG7e82RetZiZWXHyPFMYCMyOiDkRsRK4CRhZz3JnAX8AVuRYi5mZFSHPUOgJzCsYr0mnZSTtDvSKiLsb25CkUZKmSpq6aNGi0ldqZmZAvqGgeqZFNlPqAJwHHNfUhiJiXEQMiIgBlZWVJSzRzMwK5RkKNUCvgvEqYH7B+GbALsDDkuYCXwAm+mKzmVn55BkKU4C+kvpI6gwcAkysnRkRSyOie0RUR0Q18AQwIiKm5liTmZk1IrdQiIjVwGhgMjATmBAR0yWdKWlEXs9rZmbNl2v31xExCZhUZ9ppDSw7JM9azMysaf5Gs5mZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVnGoWBmZhmHgpmZZRwKZmaWcSiYmVkm13s0m5nlqXrFjS3+nHNb/Blbls8UzMws41AwM7OMQ8HMzDIOBTMzyzgUzMws41AwM7OMQ8HMzDIOBTMzyzgUzMws41AwM7OMQ8HMzDIOBTMzy+TaIZ6kYcAFQAVwZUT8vs78Y4GjgNXAIuBHEfFanjWVizvuMmtb2ur/dG6hIKkCuBjYB6gBpkiaGBEzChZ7BhgQEcsl/RT4A3BwXjWZWem11YNje5Vn89FAYHZEzImIlcBNwMjCBSLiHxGxPB19AqjKsR4zM2tCns1HPYF5BeM1wF6NLH8k8Pf6ZkgaBYwC6N27d6nqaxfK8S4Oyv9Ozu9ezZonzzMF1TMt6l1Q+h4wAPhjffMj
YlxEDIiIAZWVlSUs0czMCuV5plAD9CoYrwLm111I0t7AKcB/R8SHOdZjZmZNyPNMYQrQV1IfSZ2BQ4CJhQtI2h24HBgREQtzrMXMzIqQ25lCRKyWNBqYTPKR1KsjYrqkM4GpETGRpLloU+AWSQCvR8SIvGoya8t8HcVKIdfvKUTEJGBSnWmnFQzvnefzm5nZ+vE3ms3MLJPrmYK1b27OMGt9HApmJeQgtNbOzUdmZpZxKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCmZllHApmZpZxKJiZWcahYGZmGYeCmZllHApmZpbpWO4CWlL1ihtb/Dnntvgzmpk1n88UzMws41AwM7OMQ8HMzDIOBTMzy+QaCpKGSZolabakk+qZ30XSzen8JyVV51mPmZk1LrdQkFQBXAzsD/QHDpXUv85iRwJvR8QOwHnAOXnVY2ZmTcvzTGEgMDsi5kTESuAmYGSdZUYC16TDtwJDJSnHmszMrBGKiHw2LB0EDIuIo9Lx7wN7RcTogmVeTJepScdfSZdZXGdbo4BR6eiOwKxcim5cd2Bxk0tZqXh/txzv65ZVrv29bURUNrVQnl9eq+8df90EKmYZImIcMK4URTWXpKkRMaCcNbQn3t8tx/u6ZX3S93eezUc1QK+C8SpgfkPLSOoIbA68lWNNZmbWiDxDYQrQV1IfSZ2BQ4CJdZaZCPwgHT4IeCjyas8yM7Mm5dZ8FBGrJY0GJgMVwNURMV3SmcDUiJgIXAVcJ2k2yRnCIXnVUwJlbb5qh7y/W473dcv6RO/v3C40m5lZ6+NvNJuZWcahYGZmGYcCIOlqSQvT703UTjtL0vOSnpV0n6QeBfOGpNOnS3qkPFW3bpLmSnoh3Y9T68w7XlJI6l5n+uclrUm/A2NFkvQLSS+mf69j02lnSHoj3f/PShqeTt9H0rT0dzNN0lfLW33rUKpjSFNdA7WIiGj3D+DLwB7AiwXTPlUwfAxwWTq8BTAD6J2Ob1Xu+lvjg+T+Q93rmd6L5MMJrxXOJ/mwwkPAJOCgctffWh7ALsCLwMYkHyx5AOgLnAEcX8/yuwM9CtZ9o9yvoTU8SnEMSf/GXwG2AzoDzwH9W/q1+EwBiIhHqfP9iIh4t2B0Ez76Ut13gb9FxOvpcgtbpMj24zzgRNb9EuMY4DbA+3v97AQ8ERHLI2I18AjwjYYWjohnIqL2+0TTga6SurRAna1aiY4hxXQNlDuHQiMknS1pHnAYcFo6uR+wpaSH09Prw8tXYasWwH3pPhwFIGkEyTvT5woXlNST5EB2WcuX2eq9CHxZUjdJGwPD+ehLpaPT5o2rJW1Zz7rfAp6JiA9bqti2Zj2PIT2BeQWr16TTWpRDoRERcUpE9AJuAGr7bOoI7AkcAOwH/FpSvzKV2JoNiog9SHrR/bmkLwOn8NE/TqHzgV9GxJqWLLAtiIiZJL0P3w/cS9IksRq4FNge2A34D3Bu4XqSdk7X+3FL1tvWrOcxpKhuf/LmUCjOjSTvmiBJ73sj4v1IOu57FPhc2SprpWqbKNJT59uB/wb6AM9JmkvSLcrTkj4DDABuSqcfBFwi6evlqLs1ioirImKPiPgySRPHyxGxICLWRMRa4AqSpgsAJFWR/E4Oj4hXylN1m1PMMaSYroFy51BogKS+BaMjgH+nw3cCX5LUMT0d3wuY2dL1tWaSNpG0We0wsC8wJSK2iojqiKgm+QfZIyLejIg+BdNvBX4WEXeUq/7WRtJW6c/ewDeB8ZK2KVjkGyTNTEjaArgHODkiHm/pWtuSZhxDiukaKHd59pLaakgaDwwBukuqAU4HhkvaEVhL8kmYn0ByOi7pXuD5dN6VEfFivRu2hmwN3J7eOqMjcGNE3Fvektq02yR1A1YBP4+ItyVdJ2k3kuaJuXzUTDQa
2IGkSePX6bR9/YGKxpXqGFJf10At/lrSj0KZmZm5+cjMzD7iUDAzs4xDwczMMg4FMzPLOBTMzCzjUDAzs4xDwdqNtK+Z/epMGyvpkkbWWVbiGv7qrr/tk8yhYO3JeNa9D/gh6XQzw6Fg7cutwNdqu4KWVA30AJ6V9KCkp9Oby6zTXXF6U5S7C8b/V9IR6fCekh5Je7ycXKcLiQY1tF56RnOOpKckvSTpSxv6ws2K5VCwdiMilgBPAcPSSYcANwMfAN9Ie239CnCu0j44miKpE3ARyY1/9gSuBs4uwXodI2IgMJakywSzFuG+j6y9qW1CujP9+SOSLov/J+2+ey1JH/ZbA28Wsb0dSe5Qdn+aIxUkXVFv6Hp/S39OA6qL2J5ZSTgUrL25A/izpD2AjSLi6bQZqBLYMyJWpV10d62z3mo+fmZdO1/A9Ij44nrW0dR6tTe2WYP/T60FufnI2pWIWAY8TNJcU3uBeXNgYRoIXwG2rWfV14D+krpI2hwYmk6fBVRK+iIkzULpDWqa0tz1zHLldyDWHo0naZ6p/STSDcBdkqYCz/JRv/eZiJgnaQJJd8cvA8+k01emHzG9MA2LjiR3imu0y+PmrmeWN3edbWZmGTcfmZlZxs1HZjmQdDEwqM7kCyLiL+Wox6xYbj4yM7OMm4/MzCzjUDAzs4xDwczMMg4FMzPL/H9qUIay+rkP6gAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] @@ -628,7 +628,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Jsons below the mean are 90.48% of all jsons\n" + "Jsons below the mean are 80.93% of all jsons\n" ] } ], @@ -638,6 +638,13 @@ "total = bellow_mean_count + above_mean_count\n", "print(\"Jsons below the mean are {0:.2f}% of all jsons\".format(bellow_mean_count/total * 100))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I idenfified lots of falses positives for \"valid_json\", my false positives were all small values, like a number passing on as a 'valid json', it did not make too much of a difference in the overall analysis, but made me think, are there more false positives? how can I eliminate them? " + ] } ], "metadata": { From b77dccf91a843055989922f55ef85a4952d241d6 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Wed, 17 Apr 2019 19:40:36 -0300 Subject: [PATCH 19/23] Add new notebookt 'isJson_Occurrence_of_operation_symbols_domains.ipynb' --- .../isJson_Value_Distribution.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb index 3ff820d..f7b718a 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Value_Distribution.ipynb @@ -109,7 +109,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 1.5s\n" + "[########################################] | 100% Completed | 1.7s\n" ] } ], @@ -142,7 +142,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 4, From 2be179c944896ce9ff1362ae47508de92074fe7f Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Wed, 17 Apr 2019 19:41:14 -0300 Subject: [PATCH 20/23] Clean run of the dataPrep with all columns --- .../isJson_dataPrep.ipynb | 724 ++++++++++++------ 1 file changed, 501 
insertions(+), 223 deletions(-) diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb index f0dbbc4..7577bfa 100644 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_dataPrep.ipynb @@ -52,7 +52,7 @@ "# client\n", "\n", "#Create folder to save/read new data\n", - "DIR = 'sample0_prep/'\n", + "DIR = 'sample_0_prep/'\n", "FILE_NAME = 's0'\n", "\n", "if not os.path.exists(DIR):\n", @@ -106,8 +106,12 @@ { "data": { "text/plain": [ - "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location',\n", - " 'operation'],\n", + "Index(['argument_0', 'argument_1', 'argument_2', 'argument_3', 'argument_4',\n", + " 'argument_5', 'argument_6', 'argument_7', 'argument_8', 'arguments',\n", + " 'arguments_n_keys', 'call_stack', 'crawl_id', 'file_name', 'func_name',\n", + " 'in_iframe', 'location', 'operation', 'script_col', 'script_line',\n", + " 'script_loc_eval', 'script_url', 'symbol', 'time_stamp', 'value',\n", + " 'value_1000', 'value_len'],\n", " dtype='object')" ] }, @@ -117,10 +121,10 @@ } ], "source": [ - "#Original sample \n", + "#Original sample sample_0.parquet'\n", "df = dd.read_parquet('sample_0.parquet', \n", - " engine='pyarrow', \n", - " columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location', 'operation'])\n", + " engine='pyarrow', )\n", + "# columns=['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location', 'operation'])\n", "\n", "# df.astype({'value_1000': str, 'value': str,'value_len': int,'symbol': int,'script_url': str})\n", "df.columns" @@ -147,8 +151,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 58.7s\n", - "1356.9776628910975 0 4496861 26310.62140481331 11292867\n" + "[########################################] | 100% Completed | 1min 19.3s\n", + "MEAN: 
1356.9776628910975,\n", + "MIN: 0,\n", + "MAX: 4496861,\n", + "std: 26310.62140481331,\n", + "LEN: 11292867\n" ] } ], @@ -160,7 +168,7 @@ " df_std = df['value_len'].std()\n", " df_len = df['value_len'].count()\n", " (df_mean, df_min, df_max, df_std, df_len) = dd.compute(df_mean, df_min, df_max, df_std, df_len);\n", - " print(df_mean, df_min, df_max, df_std, df_len)" + " print(\"MEAN: {},\\nMIN: {},\\nMAX: {},\\nstd: {},\\nLEN: {}\".format(df_mean, df_min, df_max, df_std, df_len))" ] }, { @@ -227,7 +235,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 6min 23.0s\n" + "[########################################] | 100% Completed | 7min 22.3s\n" ] } ], @@ -355,12 +363,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson\n" + "Notebook name: s0_domains_isJson\n" ] } ], "source": [ - "FILE_NAME += '_isjson'\n", + "FILE_NAME += '_isJson'\n", "print('Notebook name: ', FILE_NAME)" ] }, @@ -387,7 +395,7 @@ "metadata": {}, "outputs": [], "source": [ - "df['is_json'] = df['value'].apply(is_json, meta=False)" + "df['is_json'] = df['value'].apply(is_json, meta='O')" ] }, { @@ -399,7 +407,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 4min 21.6s\n" + "[########################################] | 100% Completed | 5min 12.2s\n" ] } ], @@ -492,8 +500,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Add Column: value_md5\n", - "Include new columns called \"value_md5\" that is the md5 of value column" + "# Add json keys" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Extract the top level keys, sort them and add as a list into another column named 'json_keys'\n", + "Will be using \"https://github.com/rnd0101/json_schema_inferencer\" to guess the json schema and save it into another column called \"json_schema\"" ] }, { @@ -505,12 +520,12 @@ "name": 
"stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson_md5\n" + "Notebook name: s0_domains_isJson_jsonKeys\n" ] } ], "source": [ - "FILE_NAME += '_md5'\n", + "FILE_NAME += '_jsonKeys'\n", "print('Notebook name: ', FILE_NAME)" ] }, @@ -520,41 +535,43 @@ "metadata": {}, "outputs": [], "source": [ - "def md5(value):\n", - " return hashlib.md5(value.encode('utf-8')).hexdigest()" + " def jsonKeys(r):\n", + " if(r['is_json']):\n", + " try:\n", + " dct = json.loads(r['value'])\n", + " keys = list(dct.keys())\n", + " keys.sort()\n", + " return str(keys)\n", + " except ValueError as e:\n", + " return ''\n", + " else:\n", + " return ''" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, - "outputs": [], - "source": [ - "df['value_md5'] = df['value'].apply(md5, meta='O')" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 2min 45.9s\n" + "[########################################] | 100% Completed | 8min 32.7s\n" ] } ], "source": [ - "#save\n", + "df['json_keys'] = df.apply(jsonKeys,axis=1, meta='O')\n", "save_parquet(df=df, name=FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, + "execution_count": 19, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -578,198 +595,282 @@ " \n", " \n", " value_1000\n", - " value_md5\n", + " is_json\n", + " json_keys\n", " \n", " \n", " \n", " \n", " 0\n", " fXDcab74\n", - " 7df64196939a8b6ff11482ed6df4b25a\n", + " False\n", + " \n", " \n", " \n", " 1\n", " fXDcab74\n", - " 7df64196939a8b6ff11482ed6df4b25a\n", + " False\n", + " \n", " \n", " \n", " 2\n", " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", - " bc0aac3569031babbd73e069947a4b12\n", + " False\n", + " \n", " \n", " \n", " 3\n", " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", - " bc0aac3569031babbd73e069947a4b12\n", + " 
False\n", + " \n", " \n", " \n", " 4\n", " _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...\n", - " 324dd29b8c6438bc700ac2d85e33f12d\n", + " False\n", + " \n", " \n", " \n", "\n", "" ], "text/plain": [ - " value_1000 \\\n", - "0 fXDcab74 \n", - "1 fXDcab74 \n", - "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", - "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", - "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... \n", - "\n", - " value_md5 \n", - "0 7df64196939a8b6ff11482ed6df4b25a \n", - "1 7df64196939a8b6ff11482ed6df4b25a \n", - "2 bc0aac3569031babbd73e069947a4b12 \n", - "3 bc0aac3569031babbd73e069947a4b12 \n", - "4 324dd29b8c6438bc700ac2d85e33f12d " + " value_1000 is_json json_keys\n", + "0 fXDcab74 False \n", + "1 fXDcab74 False \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... False \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... False \n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... False " ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#read\n", + "#read \n", "df = read_parquet(FILE_NAME)\n", - "df[['value_1000', 'value_md5']].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Saving other possible usefull filtered samples to future analyses" + "df[['value_1000', 'is_json', 'json_keys']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## value_len > df_mean\n", - "1356 is the value_len mean\n", - "\n", - "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", - "\n", - "All values above the mean count up to 499805 rows. That is just 4,42% of the whole sample, and a lot easier to work on. 
" + "# Add Column: keys_md5\n", + "Include new columns called \"keys_md5\" that is the md5 of json_keys column" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson_md5_above_mean\n" + "Notebook name: s0_domains_isJson_jsonKeys_md5\n" ] } ], "source": [ - "name = FILE_NAME + '_above_mean'\n", - "print('Notebook name: ', name)" + "FILE_NAME += '_md5'\n", + "print('Notebook name: ', FILE_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def md5(value):\n", + " if (value == ''):\n", + " return ''\n", + " else:\n", + " return hashlib.md5(value.encode('utf-8')).hexdigest()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, + "outputs": [], + "source": [ + "df['keys_md5'] = df['json_keys'].apply(md5, meta='O')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 50.5s\n", - "Npartition: 245\n", - "[########################################] | 100% Completed | 1min 38.3s\n" + "[########################################] | 100% Completed | 3min 49.6s\n" ] } ], "source": [ - "#Save\n", - "save_parquet(df= df[df['value_len'] > df_mean], name= name, recalculate_partition=True)" + "#save\n", + "save_parquet(df=df, name=FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_1000keys_md5
0fXDcab74
1fXDcab74
2Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
3Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
4_ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...
\n", + "
" + ], "text/plain": [ - "Index(['value_1000', 'value', 'value_len', 'symbol', 'script_url', 'location',\n", - " 'operation', 'location_domain', 'script_domain', 'is_json',\n", - " 'value_md5'],\n", - " dtype='object')" + " value_1000 keys_md5\n", + "0 fXDcab74 \n", + "1 fXDcab74 \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... " ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#Read\n", - "df = read_parquet(name)\n", - "df.columns" + "#read\n", + "df = read_parquet(FILE_NAME)\n", + "df[['value_1000', 'keys_md5']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Filter to parquet containing only JSON " + "# TLD\n", + "Include new columns called \"script_tld\" that is the the TLD for the script_domain" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson_md5_JSON_ONLY\n" + "Notebook name: s0_domains_isJson_jsonKeys_md5_TLD\n" ] } ], "source": [ - "name = FILE_NAME + '_JSON_ONLY'\n", - "print('Notebook name: ', name)" + "FILE_NAME += '_TLD'\n", + "print('Notebook name: ', FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def extractTLD(domain):\n", + " return domain.split('.')[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df['script_tld'] = df['script_domain'].apply(extractTLD, meta='O')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 28.9s\n", - "Npartition: 233\n", - 
"[########################################] | 100% Completed | 1min 5.0s\n" + "[########################################] | 100% Completed | 3min 59.4s\n" ] } ], "source": [ - "save_parquet(df=df[df['is_json'] == True], name=name, recalculate_partition=True)" + "#save\n", + "save_parquet(df=df, name=FILE_NAME)" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 29, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -792,144 +893,194 @@ " \n", " \n", " \n", - " value_1000\n", - " is_json\n", + " script_domain\n", + " script_tld\n", " \n", " \n", " \n", " \n", " 0\n", - " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " True\n", + " vk.com\n", + " com\n", " \n", " \n", " 1\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " True\n", + " vk.com\n", + " com\n", " \n", " \n", " 2\n", - " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " True\n", + " vk.com\n", + " com\n", " \n", " \n", " 3\n", - " {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...\n", - " True\n", + " baidustatic.com\n", + " com\n", " \n", " \n", " 4\n", - " {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...\n", - " True\n", + " google.com\n", + " com\n", " \n", " \n", "\n", "" ], "text/plain": [ - " value_1000 is_json\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... True\n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", - "3 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... True\n", - "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... 
True" + " script_domain script_tld\n", + "0 vk.com com\n", + "1 vk.com com\n", + "2 vk.com com\n", + "3 baidustatic.com com\n", + "4 google.com com" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#read all_json_above_mean\n", - "df = read_parquet(name)\n", - "df[['value_1000', 'is_json']].head()" + "#read\n", + "df = read_parquet(FILE_NAME)\n", + "df[['script_domain', 'script_tld']].head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Add json keys and schema columns" + "# Saving other possible usefull filtered samples to future analyses" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Extract the top level keys, sort them and add as a list into another column named 'json_keys'\n", - "Will be using \"https://github.com/rnd0101/json_schema_inferencer\" to guess the json schema and save it into another column called \"json_schema\"" + "## value_len > df_mean\n", + "1356 is the value_len mean\n", + "\n", + "To filter the data into something that is more interesting to this task I decided to only work with values that are at above the mean.\n", + "\n", + "All values above the mean count up to 499805 rows. That is just 4,42% of the whole sample, and a lot easier to work on. 
" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson_md5_JSON_ONLY_schema_keys\n" + "Notebook name: s0_domains_isJson_jsonKeys_md5_TLD_above_mean\n" ] } ], "source": [ - "name += '_schema_keys'\n", + "name = FILE_NAME + '_above_mean'\n", "print('Notebook name: ', name)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 2min 23.6s\n" + ] + } + ], "source": [ - "from json_schema_inferencer.guess_json_schema import guess_schema\n", - "\n", - "def jsonSchema(myjson):\n", - " try:\n", - " dct = json.loads(myjson)\n", - " value = guess_schema(dct)\n", - " l = list(value['properties'])\n", - " l.sort()\n", - " return l\n", - " except ValueError as e:\n", - " return list()\n", - " \n", - "def jsonKeys(myjson):\n", - " try:\n", - " dct = json.loads(myjson)\n", - " keys = list(dct.keys())\n", - " keys.sort()\n", - " return keys\n", - " except ValueError as e:\n", - " return list()" + "#Save\n", + "save_parquet(df= df[df['value_len'] > df_mean], name= name)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['argument_0', 'argument_1', 'argument_2', 'argument_3', 'argument_4',\n", + " 'argument_5', 'argument_6', 'argument_7', 'argument_8', 'arguments',\n", + " 'arguments_n_keys', 'call_stack', 'crawl_id', 'file_name', 'func_name',\n", + " 'in_iframe', 'location', 'operation', 'script_col', 'script_line',\n", + " 'script_loc_eval', 'script_url', 'symbol', 'time_stamp', 'value',\n", + " 'value_1000', 'value_len', 'location_domain', 'script_domain',\n", + " 'is_json', 'json_keys', 'keys_md5', 
'script_tld'],\n", + " dtype='object')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Read\n", + "df = read_parquet(name)\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter to parquet containing only JSON " + ] + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 4min 18.1s\n" + "Notebook name: s0_domains_isJson_jsonKeys_md5_TLD_JSON_ONLY\n" ] } ], "source": [ - "df['json_keys'] = df.value.apply(jsonKeys, meta='O')\n", - "df['json_schema'] = df.value.apply(jsonSchema, meta='O')\n", - "save_parquet(df=df, name=name)\n" + "name = FILE_NAME + '_JSON_ONLY'\n", + "print('Notebook name: ', name)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 1min 20.0s\n" + ] + } + ], + "source": [ + "save_parquet(df=df[df['is_json'] == True], name=name)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -953,77 +1104,57 @@ " \n", " \n", " value_1000\n", - " json_keys\n", - " json_schema\n", + " is_json\n", " \n", " \n", " \n", " \n", " 0\n", " {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site...\n", - " [im-settings]\n", - " [im-settings]\n", + " True\n", " \n", " \n", " 1\n", " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]\n", - " [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]\n", + " True\n", " \n", " \n", " 2\n", " {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c...\n", - " [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]\n", - " [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c]\n", + " 
True\n", " \n", " \n", " 3\n", " {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...\n", - " [LastSearch, LastSearch_e, dueljs_channel_comm...\n", - " [LastSearch, LastSearch_e, dueljs_channel_comm...\n", + " True\n", " \n", " \n", " 4\n", " {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279...\n", - " [LastSearch, LastSearch_e, dueljs_channel_comm...\n", - " [LastSearch, LastSearch_e, dueljs_channel_comm...\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " value_1000 \\\n", - "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... \n", - "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... \n", - "3 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... \n", - "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... \n", - "\n", - " json_keys \\\n", - "0 [im-settings] \n", - "1 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", - "2 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", - "3 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", - "4 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", - "\n", - " json_schema \n", - "0 [im-settings] \n", - "1 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", - "2 [APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c] \n", - "3 [LastSearch, LastSearch_e, dueljs_channel_comm... \n", - "4 [LastSearch, LastSearch_e, dueljs_channel_comm... " + " value_1000 is_json\n", + "0 {\"im-settings\":\"{\\\"val\\\":{\\\"settings\\\":{\\\"Site... True\n", + "1 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "2 {\"APLUS_S_CORE_0.17.12_20171214163401_2ee09a0c... True\n", + "3 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... True\n", + "4 {\"dueljs_channel_comm\":\"[{\\\"id\\\":4734405521279... 
True" ] }, - "execution_count": 30, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#read \n", + "#read all_json_above_mean\n", "df = read_parquet(name)\n", - "df[['value_1000', 'json_keys', 'json_schema']].head()" + "df[['value_1000', 'is_json']].head()" ] }, { @@ -1035,14 +1166,14 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Notebook name: s0_domains_isjson_md5_nonJSON_ONLY\n" + "Notebook name: s0_domains_isJson_jsonKeys_md5_TLD_nonJSON_ONLY\n" ] } ], @@ -1054,16 +1185,16 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 1min 54.5s\n", - "Npartition: 116\n", - "[########################################] | 100% Completed | 1min 13.1s\n" + "[########################################] | 100% Completed | 4min 34.1s\n", + "Npartition: 285\n", + "[########################################] | 100% Completed | 2min 11.3s\n" ] } ], @@ -1073,17 +1204,9 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 38, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/dataframe/core.py:4494: UserWarning: Insufficient elements for `head`. 5 elements requested, only 0 elements available. 
Try passing larger `npartitions` to `head`.\n", - " warnings.warn(msg.format(n, len(r)))\n" - ] - }, { "data": { "text/html": [ @@ -1105,33 +1228,195 @@ " \n", " \n", " \n", - " value_1000\n", + " argument_0\n", + " argument_1\n", + " argument_2\n", + " argument_3\n", + " argument_4\n", + " argument_5\n", + " argument_6\n", + " argument_7\n", + " argument_8\n", + " arguments\n", + " ...\n", + " time_stamp\n", " value\n", + " value_1000\n", " value_len\n", - " symbol\n", - " script_url\n", - " location\n", - " operation\n", " location_domain\n", " script_domain\n", " is_json\n", - " value_md5\n", " json_keys\n", - " json_schema\n", + " keys_md5\n", + " script_tld\n", " \n", " \n", " \n", + " \n", + " 0\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " {}\n", + " ...\n", + " 2017-12-16 19:02:31.406\n", + " fXDcab74\n", + " fXDcab74\n", + " 8\n", + " vk.com\n", + " vk.com\n", + " False\n", + " \n", + " \n", + " com\n", + " \n", + " \n", + " 1\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " {}\n", + " ...\n", + " 2017-12-16 19:02:31.407\n", + " fXDcab74\n", + " fXDcab74\n", + " 8\n", + " vk.com\n", + " vk.com\n", + " False\n", + " \n", + " \n", + " com\n", + " \n", + " \n", + " 2\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " {}\n", + " ...\n", + " 2017-12-16 19:02:31.659\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " 68\n", + " vk.com\n", + " vk.com\n", + " False\n", + " \n", + " \n", + " com\n", + " \n", + " \n", + " 3\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " {}\n", + " ...\n", + " 2017-12-16 00:24:09.355\n", + " Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...\n", + " Mozilla/5.0 
(X11; Linux x86_64; rv:52.0) Gecko...\n", + " 68\n", + " baidu.com\n", + " baidustatic.com\n", + " False\n", + " \n", + " \n", + " com\n", + " \n", + " \n", + " 4\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " {}\n", + " ...\n", + " 2017-12-16 01:24:30.372\n", + " _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...\n", + " _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17...\n", + " 288\n", + " serienjunkies.org\n", + " google.com\n", + " False\n", + " \n", + " \n", + " com\n", + " \n", " \n", "\n", + "

5 rows × 33 columns

\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [value_1000, value, value_len, symbol, script_url, location, operation, location_domain, script_domain, is_json, value_md5, json_keys, json_schema]\n", - "Index: []" + " argument_0 argument_1 argument_2 argument_3 argument_4 argument_5 \\\n", + "0 None None None None None None \n", + "1 None None None None None None \n", + "2 None None None None None None \n", + "3 None None None None None None \n", + "4 None None None None None None \n", + "\n", + " argument_6 argument_7 argument_8 arguments ... time_stamp \\\n", + "0 None None None {} ... 2017-12-16 19:02:31.406 \n", + "1 None None None {} ... 2017-12-16 19:02:31.407 \n", + "2 None None None {} ... 2017-12-16 19:02:31.659 \n", + "3 None None None {} ... 2017-12-16 00:24:09.355 \n", + "4 None None None {} ... 2017-12-16 01:24:30.372 \n", + "\n", + " value \\\n", + "0 fXDcab74 \n", + "1 fXDcab74 \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... \n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... \n", + "\n", + " value_1000 value_len \\\n", + "0 fXDcab74 8 \n", + "1 fXDcab74 8 \n", + "2 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 \n", + "3 Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko... 68 \n", + "4 _ga=GA1.2.1529583939.1513387469; _gid=GA1.2.17... 
288 \n", + "\n", + " location_domain script_domain is_json json_keys keys_md5 script_tld \n", + "0 vk.com vk.com False com \n", + "1 vk.com vk.com False com \n", + "2 vk.com vk.com False com \n", + "3 baidu.com baidustatic.com False com \n", + "4 serienjunkies.org google.com False com \n", + "\n", + "[5 rows x 33 columns]" ] }, - "execution_count": 35, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1141,13 +1426,6 @@ "df = read_parquet(name)\n", "df.head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From f30f68ab2093d57dc2d32ee763b9cfb2ba373c1e Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 22 Apr 2019 00:08:51 -0300 Subject: [PATCH 21/23] Add isJson_Identify_Source.ipynb --- .../isJson_Identify_Source.ipynb | 350 +++++ ...urrence_of_operation_symbols_domains.ipynb | 1279 +++++++++++++++++ 2 files changed, 1629 insertions(+) create mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Identify_Source.ipynb create mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Occurrence_of_operation_symbols_domains.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Identify_Source.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Identify_Source.ipynb new file mode 100644 index 0000000..23bae7d --- /dev/null +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Identify_Source.ipynb @@ -0,0 +1,350 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.diagnostics import ProgressBar\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Objective\n", + "\n", + "The objective of this notebook is to answer: \n", + " - \"The JSON values are always from the same location or related domains?\" \n", + "\n", + "To answer this we will use the sample data set produced by the notebook \"isJson_dataPrep.ipynb\":\n", + "- 's0_domains_isJson_jsonKeys_md5_TLD_JSON_ONLY.parquet'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Findings\n", + "To answer the question \"The JSON values are always from the same location or related domains?\" \n", + "No, not always, but usually. 83.09% of the JSONs are produced by a single script domain. \n", + "\n", + "---\n", + "\n", + "About 71% of the JSONs are seen more than once across the data set, that means that they MAY have different origins.\n", + "- Most JSONs are from a single script domain. \n", + "- Almost 17% of the JSONs have multiple origins[1], mostly they have 2 to 3 origins, very few have more than this. \n", + "- They may be related, since 40% of them have the same TLD[2]. \n", + "- Some of the ones that have multiple script domains have the same location domain (41%) calling different scripts but producing the same JSON[3]. \n", + "- They may have some similarities in usage, 99% of them have a single symbol across the different domains[4]\n", + "\n", + "---\n", + " For further investigation: \n", + " 1. Are these JSONs any different? Are they big/small JSONs? It may be that they have the same top keys but are in reality very different? \n", + " 2. Do the script domains that produce the same JSON have any relation between them? 
How can I relate domains?\n", + " 3. What does it mean for different scripts to get the same JSON for a single location? \n", + " 4. Are they used for the same purpose? Can we really say that based on the symbol? " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DIR = 'sample_0_prep/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['symbol', 'location_domain', 'script_domain', 'json_keys', 'keys_md5',\n", + " 'script_tld', 'value_len'],\n", + " dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet(DIR + 's0_domains_isJson_jsonKeys_md5_TLD_JSON_ONLY.parquet',\n", + " engine='pyarrow',\n", + " columns=['symbol', 'location_domain', 'script_domain', 'json_keys', 'keys_md5', 'script_tld', 'value_len'])\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# The JSON values are always from the same location or related domains?\n", + "How many locations does one JSON have?\n", + "Do all bigger JSONs have the same locations?\n", + "What is \"related domains\"?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 1.8s\n", + "The total number of different keys_md5 is 14374\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " group_by_keys_md5 = df.compute().groupby(['keys_md5'])\n", + " group_by_keys_md5_number_of_different_keys = len(group_by_keys_md5)\n", + " print(\"The total number of different {} is {}\".format('keys_md5', group_by_keys_md5_number_of_different_keys))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "agg = 
group_by_keys_md5.agg(['nunique'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are a total of 10222(71.11%) JSONs that appear in multiple rows\n" + ] + } + ], + "source": [ + "\n", + "json_multiple_appearances = agg['symbol'][group_by_keys_md5['symbol'].count() > 1]\n", + "json_multiple_appearances_len = len(json_multiple_appearances)\n", + "agg_len = len(agg['symbol'])\n", + "print('There are a total of {0}({1:0.2f}%) JSONs that appear in multiple rows'.format(\n", + " json_multiple_appearances_len, \n", + " json_multiple_appearances_len*100/agg_len))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_multiple(agg, column, title=''):\n", + " agg_len = len(agg[column])\n", + " x = agg[agg[column]['nunique'] > 1]\n", + " x_len = len(x)\n", + " print(title + '{0} ({1:0.2f}%) multiple {2},\\n{3} ({4:0.2f}%) unique {2}'.format(\n", + " x_len,\n", + " x_len*100/agg_len,\n", + " column, \n", + " agg_len - x_len,\n", + " (agg_len - x_len) * 100 / agg_len\n", + " ))\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSONs origin: script domain\n", + "\n", + "Plot that shows that most JSONs are originated from a single script domain" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SCRIPT DOMAIN data: from the total of json\n", + "2430 (16.91%) multiple script_domain,\n", + "11944 (83.09%) unique script_domain\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "multiple_script_domain = get_multiple(agg, 'script_domain', 'SCRIPT DOMAIN data: from the total of json\\n')\n", + "pd.DataFrame([[len(multiple_script_domain)/agg_len], \n", + " [(agg_len - len(multiple_script_domain))/agg_len]], \n", + " ['multiple', 'single']).plot(kind='bar', title='JSONs origin: Script domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 2430.000000\n", + "mean 2.483128\n", + "std 1.213823\n", + "min 2.000000\n", + "25% 2.000000\n", + "50% 2.000000\n", + "75% 3.000000\n", + "max 34.000000\n", + "Name: nunique, dtype: float64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multiple_script_domain.script_domain['nunique'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7 (0.29%) multiple symbol,\n", + "2423 (99.71%) unique symbol\n" + ] + } + ], + "source": [ + "# Out of the multiple_script_domain\n", + "multiple_script_domain_symbol = get_multiple(multiple_script_domain, 'symbol')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1413 (58.15%) multiple script_tld,\n", + "1017 (41.85%) unique script_tld\n" + ] + } + ], + "source": [ + "# Out of the multiple_script_domain\n", + "multiple_script_domain_location_tld = get_multiple(multiple_script_domain, 'script_tld')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98 (4.03%) multiple location_domain,\n", + "2332 (95.97%) unique location_domain\n" + ] + } + ], + "source": [ + "# Out of 
the multiple_script_domain\n", + "multiple_script_domain_location_tld = get_multiple(multiple_script_domain, 'location_domain')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Occurrence_of_operation_symbols_domains.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Occurrence_of_operation_symbols_domains.ipynb new file mode 100644 index 0000000..cd64c26 --- /dev/null +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Occurrence_of_operation_symbols_domains.ipynb @@ -0,0 +1,1279 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.diagnostics import ProgressBar\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook uses the parquet produced by the \"isJson_dataPrep.ipynb\":\n", + "- 's0_domains_isJson_jsonKeys_md5_TLD.parquet'\n", + "\t- It contains all the original 10% sample with extra columns.\n", + "\n", + "# Objective\n", + "Show and compare between samples the presence and occurrence of operation/symbols/domain/tld. \n", + "\n", + "I'll be doing two of each graph to show the difference between the whole data and the data filtered to only the rows whose value_len is above the mean\n", + "\n", + "# Overview\n", + "### Operation\n", + "The most used operation across the entire sample is GET. \n", + "99.67% of the valid JSONs have GET as their operation. If filtered by value_len above the mean then all 100% of the valid JSONs are GET. \n", + "\n", + "### Symbols\n", + "The difference for the unique symbols counting for the whole sample and the filtered one is really big. The one thing I can say is that 'window.localStorage' is the one that produces most JSONs (65%) and 'window.document.cookie' is the one responsible for 34% of the non-JSON, anything else may require further investigation and understanding. \n", + "\n", + "### Domain\n", + "'Baidu' has the most occurrences for valid JSON values (15%) but it's only in the 5th position when it comes to the values above the mean (5.9%).\n", + "'Google.Analytics' is the top one for the non-JSON values for both all values and bigger values.\n", + "\n", + "\n", + "### TLD\n", + "The TLD is more balanced between the non-JSON and JSON values, and the top ones remain for the filtered data. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DIR = 'sample_0_prep/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 6.8s\n" + ] + } + ], + "source": [ + "columns=['operation', 'symbol', 'script_domain', 'is_json', 'keys_md5', 'script_tld', 'value_len']\n", + "df = dd.read_parquet(DIR + 's0_domains_isJson_jsonKeys_md5_TLD.parquet',\n", + " engine='pyarrow',\n", + " columns=columns)\n", + "with ProgressBar():\n", + " mean = df['value_len'].mean().compute()\n", + "\n", + "df_a = df[df.value_len > mean]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Support code\n", + "This section is where some support code is placed. \n", + "Some of the code here is where the math actually happen and the other section uses it. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 7.1s\n", + "[########################################] | 100% Completed | 7.3s\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " df_json = df[df.is_json == True].compute()\n", + " df_other = df[df.is_json == False].compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 7.1s\n", + "[########################################] | 100% Completed | 6.8s\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " df_a_json = df_a[df_a.is_json == True].compute()\n", + " df_a_other = df_a[df_a.is_json == False].compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": 
{}, + "outputs": [], + "source": [ + "def calcUniquePercentual(df, column):\n", + " v = df[column].value_counts()\n", + " l = df[column].count()\n", + " return v/l" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def plotUsageComparation(df_json, df_other, column):\n", + " nonjsons = calcUniquePercentual(df_other, column=column)\n", + " jsons = calcUniquePercentual(df_json, column=column)\n", + " p1 = pd.DataFrame({'json': jsons,'other':nonjsons}).sort_values('json', ascending=False)\n", + " p1.plot(kind='bar')\n", + " return p1" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def plotTopUsageComparation(df_json, df_other, column, top):\n", + " nonjsons = calcUniquePercentual(df_other, column=column)\n", + " jsons = calcUniquePercentual(df_json, column=column)\n", + " \n", + " p1 = pd.DataFrame({'json': jsons,'other':nonjsons})\n", + " top_json = p1.sort_values('json', ascending=False).head(top)\n", + " top_other = p1.sort_values('other', ascending=False).head(top)\n", + " tops = pd.concat([top_json, top_other]).drop_duplicates()\n", + " tops.plot(kind='bar')\n", + " return tops" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "def plotUniqueValuesComparation(df_json, df_other, column):\n", + " nonjsons = calcUniquePercentual(df_other, column=column)\n", + " jsons = calcUniquePercentual(df_json, column=column)\n", + " \n", + " #Value counts\n", + " count_nonjson = len(nonjsons)\n", + " count_json = len(jsons)\n", + " p1 = pd.DataFrame([count_json, count_nonjson], \n", + " index= [ 'Json', 'Other' ], \n", + " columns=['Value Counts'])\n", + " p1.plot(kind='bar')\n", + " print(\"There are {} unique {} present on the non-json dataset and {} on the JSONs\".format(count_nonjson,\n", + " column,\n", + " count_json))\n", + " return p1" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# OPERATION:\n", + "\n", + "The operation columns can have 3 different values \n", + " - GET\n", + " - SET\n", + " - CALL\n", + "\n", + "We can see below that pretty much all[1] JSONs have the operation GET when the whole sample is analysed and ALL JSONs have GET when we filter the sample to values above the mean. \n", + "The GET operation is the most common among the non-json values as well. \n", + "\n", + "---\n", + " For futher investigation: \n", + "1. Are the JSONs that have SET as operation really JSON? Are they false positives? Why are they different? " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEDCAYAAADOc0QpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEh1JREFUeJzt3X+QVeV9x/H3NwjSqMEE11ZZCExKVNwkK6yAoQOY6ChqAY1NtCapEyOTSdQ6/qhEWsfaZPLD1MTOaBpNNDWpqDGNMpaOk4mKvy2gkAEdHWK0bmASJEIVawTn2z/ulazryp5dLnt3H96vmZ2955znnvPdubuffe5zz3lOZCaSpLK8q9kFSJIaz3CXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFWivZh34gAMOyPHjxzfr8JI0JK1cufLFzGzprV3Twn38+PGsWLGiWYeXpCEpIp6v0s5hGUkqkOEuSQUy3CWpQE0bc5ekKrZt20ZnZyevvfZas0sZUCNHjqS1tZXhw4f36/m9hntE3ACcBPwuM9t62B7A1cAJwKvAmZn5eL+qkaRuOjs72W+//Rg/fjy1uClfZrJp0yY6OzuZMGFCv/ZRZVjmh8DxO9k+B5hY/1oAfLdflUhSD1577TVGjx69xwQ7QEQwevToXXq30mu4Z+b9wO930mQecFPWPArsHxEH9bsiSepmTwr2N+3qz9yID1THAC90We6sr5MkNUkjPlDt6d9LjzdmjYgF1IZuGDduXAMOXd34hf85oMd77usnDujxpD1Fo/+Wq/6tfvSjH+Xhhx9u6LF3p0b03DuBsV2WW4H1PTXMzOsysyMzO1paer16VpIGjaEU7NCYcF8CfDZqpgNbMnNDA/YrSYPGvvvuy4YNG5g5cybt7e20tbXxwAMPALB48WI+9KEP0dbWxiWXXPKW5yxatIiPfOQjTJ8+nd/+9rcDVm+v4R4Ri4FHgEMiojMizoqIL0TEF+pNlgLPAuuA64Ev7rZqJamJbr75Zo477jhWrVrF6tWraW9vZ/369VxyySXcc889rFq1iuXLl3PHHXcAsHXrVqZPn87q1auZOXMm119//YDV2uuYe2ae3sv2BL7UsIokaZA68sgj+dznPse2bduYP38+7e3t3HPPPcyeP
Zs3h5rPOOMM7r//fubPn8+IESM46aSTAJgyZQo///nPB6xWpx+QpIpmzpzJ/fffz5gxY/jMZz7DTTfdRK1/27Phw4fvOKVx2LBhbN++faBKNdwlqarnn3+eAw88kLPPPpuzzjqLxx9/nGnTprFs2TJefPFF3njjDRYvXsysWbOaXapzy0gaWpp1mnFEcN9993HllVcyfPhw9t13X2666SYOOuggvva1r3H00UeTmZxwwgnMmzevKTW+pd6dvaXYnTo6OnIgb9bhee7S0PTUU09x2GGHNbWGTZs2MXnyZJ5/vtJ9Mhqmp589IlZmZkdvz3VYRpJ2Yv369Rx11FFcdNFFzS6lTxyWkaSdOPjgg3nmmWeaXUaf2XOXpAIZ7pJUIMNdkgpkuEtSgfxAVdLQcvmoBu9vS7+etnnzZm6++Wa++MXadFr33Xcf3/rWt7jrrrsaWV2/2XOXpH7YvHkz1157bcP21+ipCQx3Sargqquuoq2tjba2Nr7zne+wcOFCfvWrX9He3s7FF18MwCuvvMKpp57KoYceyhlnnLFj3pmVK1cya9YspkyZwnHHHceGDbVZ0WfPns2ll17KrFmzuPrqqxtar8MyktSLlStXcuONN/LYY4+RmUybNo0f//jHrFmzhlWrVgG1YZknnniCtWvXcvDBBzNjxgweeughpk2bxrnnnsudd95JS0sLt956K4sWLeKGG24Aau8Ali1b1vCaDXdJ6sWDDz7IySefzD777APAKaecsuNGHV1NnTqV1tZWANrb23nuuefYf//9WbNmDcceeywAb7zxBgcddNCO53zqU5/aLTUb7pLUi6pzcO299947Hr85xW9mcvjhh/PII4/0+Jw3/2E0mmPuktSLmTNncscdd/Dqq6+ydetWfvaznzFjxgxefvnlXp97yCGHsHHjxh3hvm3bNtauXbu7S7bnLmmI6eepi7ti8uTJnHnmmUydOhWAz3/+80yZMoUZM2bQ1tbGnDlzOPHEnmeCHTFiBLfffjvnnXceW7ZsYfv27Zx//vkcfvjhu7Vmp/zdTZzyV2qMwTDlb7M45a8k6S0Md0kqkOEuadBr1vBxM+3qz2y4SxrURo4cyaZNm/aogM9MNm3axMiRI/u9D8+WkTSotba20tnZycaNG5tdyoAaOXLkjgui+sNwlzSoDR8+nAkTJjS7jCHHYRlJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgSqFe0QcHxFPR8S6iFjYw/ZxEXFvRDwREb+MiBMaX6okqapewz0ihgHXAHOAScDpETGpW7O/B27LzCOA04BrG12oJKm6Kj33qcC6zHw2M18HbgHmdWuTwHvqj0cB6xtXoiSpr6qE+xjghS7LnfV1XV0OfDoiOoGlwLk97SgiFkTEiohYsadNAiRJA6lKuEcP67rPvXk68MPMbAVOAH4UEW/bd2Zel5kdmdnR0tLS92olSZVUCfdOYGyX5VbePuxyFnAbQGY+AowEDmhEgZKkvqsS7suBiRExISJGUPvAdEm3Nv8DfBwgIg6jFu6Ou0hSk/Qa7pm5HTgHuBt4itpZMWsj4oqImFtvdiFwdkSsBhYDZ+aedNsUSRpkKt2sIzOXUvugtOu6y7o8fhKY0djSJEn95RWqklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpALtVaVRRBwPXA0MA76fmV/voc0ngcuBBFZn5l83sM6h5/JRA3y8LQN7PEmDWq/hHhHDgGuAY4FOYHlELMnMJ7u0mQh8GZiRmS9FxIG7q2BJUu+qDMtMBdZl5rOZ+TpwCzCvW5uzgWsy8yWAzPxdY8uUJPVFlXAfA
7zQZbmzvq6rDwIfjIiHIuLR+jCOJKlJqoy5Rw/rsof9TARmA63AAxHRlpmb37KjiAXAAoBx48b1uVhJUjVVeu6dwNguy63A+h7a3JmZ2zLz18DT1ML+LTLzuszsyMyOlpaW/tYsSepFlXBfDkyMiAkRMQI4DVjSrc0dwNEAEXEAtWGaZxtZqCSpul7DPTO3A+cAdwNPAbdl5tqIuCIi5tab3Q1siogngXuBizNz0+4qWpK0c5XOc8/MpcDSbusu6/I4gQvqX5KkJvMKVUkqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKlClcI+I4yPi6YhYFxELd9Lu1IjIiOhoXImSpL7qNdwjYhhwDTAHmAScHhGTemi3H3Ae8Fiji5Qk9U2VnvtUYF1mPpuZrwO3APN6aPdPwDeB1xpYnySpH6qE+xjghS7LnfV1O0TEEcDYzLxrZzuKiAURsSIiVmzcuLHPxUqSqqkS7tHDutyxMeJdwLeBC3vbUWZel5kdmdnR0tJSvUpJUp9UCfdOYGyX5VZgfZfl/YA24L6IeA6YDizxQ1VJap4q4b4cmBgREyJiBHAasOTNjZm5JTMPyMzxmTkeeBSYm5krdkvFkqRe9RrumbkdOAe4G3gKuC0z10bEFRExd3cXKEnqu72qNMrMpcDSbusue4e2s3e9LEnSrvAKVUkqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kq0F7NLkCSGuryUQN8vC0De7yK7LlLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklSgSuEeEcdHxNMRsS4iFvaw/YKIeDIifhkRv4iI9ze+VElSVb2Ge0QMA64B5gCTgNMjYlK3Zk8AHZn5YeB24JuNLlSSVF2VnvtUYF1mPpuZrwO3APO6NsjMezPz1frio0BrY8uUJPVFlXAfA7zQZbmzvu6dnAX8V08bImJBRKyIiBUbN26sXqUkqU+qhHv0sC57bBjxaaADuLKn7Zl5XWZ2ZGZHS0tL9SolSX1SZeKwTmBsl+VWYH33RhFxDLAImJWZf2hMeZKk/qjSc18OTIyICRExAjgNWNK1QUQcAXwPmJuZv2t8mZKkvug13DNzO3AOcDfwFHBbZq6NiCsiYm692ZXAvsBPImJVRCx5h91JkgZApfncM3MpsLTbusu6PD6mwXVJknaBV6hKUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SClTpIiZpj3L5qAE+3paBPZ72CPbcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSpQpXCPiOMj4umIWBcRC3vYvndE3Frf/lhEjG90oZKk6noN94gYBlwDzAEmAadHxKRuzc4CXsrMPwe+DXyj0YVKkqqr0nOfCqzLzGcz83XgFmBetzbzgH+rP74d+HhEROPKlCT1xV4V2owBXuiy3AlMe6c2mbk9IrYAo4EXuzaKiAXAgvriKxHxdH+KHgoCDqDbz79b/aP/SxvI125oK/31e3+VRlXCvafKsx9tyMzrgOsqHHPIi4gVmdnR7DrUd752Q5uvX02VYZlOYGyX5VZg/Tu1iYi9gFHA7xtRoCSp76qE+3JgYkRMiIgRwGnAkm5tlgB/U
398KnBPZr6t5y5JGhi9DsvUx9DPAe4GhgE3ZObaiLgCWJGZS4AfAD+KiHXUeuyn7c6ih4g9YvipUL52Q5uvHxB2sCWpPF6hKkkFMtwlqUCGuyQVyHCXqM2PVGWdNFQY7g0SEROqrNOg9UjFddKQUOUKVVXzU2Byt3W3A1OaUIsqiog/ozZ9xp9ExBH88Wrr9wDvblphqiQiLtjZ9sy8aqBqGWwM910UEYcChwOjIuKULpveA4xsTlXqg+OAM6lded01CP4XuLQZBalP9mt2AYOV57nvooiYB8wH5vLWK3dfBm7JzIebUpj6JCI+kZk/bXYdUqMY7g0SEUdlpmO0Q1R9eOarwMGZOad+z4KjMvMHTS5NOxER/7Kz7Zl53kDVMtg4LNM4myLiF8CfZmZbRHwYmJuZX2l2YarkxvrXovryM8Ct1KbW0OC1stkFDFb23BskIpYBFwPfy8wj6uvWZGZbcytTFRGxPDOPjIgnurx+qzKzvdm1Sf1hz71x3p2Z/93tBlTbm1WM+mxrRIymfh+CiJgObGluSaoqIlqAS6jdCnTHiQyZ+bGmFdVkhnvjvBgRH+CP4XAqsKG5JakPLqD2gfgHIuIhoIXa9NUaGv6d2jDaicAXqE1BvrGpFTWZ4d44X6I21eihEfEb4NfAGc0tSX3wAWo3gR8LfILarST9+xg6RmfmDyLibzNzGbCsPlS6x/KXt3HmA0uBe6ld+bsVOCYiVmbmqqZWpir+ITN/EhHvBY4B/hn4Lm+/X7AGp2317xsi4kRqd4trbWI9Tef0A43TQe3t4HuB/andCHw2cH1E/F0T61I1b9S/nwj8a2beCYxoYj3qm69ExCjgQuAi4PvA+c0tqbkM98YZDUzOzIsy80JqYd8CzKR2BaQGt99ExPeATwJL65OG+fcxdPwVtbP/1mTm0cCxwMlNrqmp/OVtnHHA612WtwHvz8z/A/7QnJLUB5+kdivJ4zNzM/A+aqe2amj4cP11AyAzfw8c0cR6ms4x98a5GXg0Iu6sL/8lsDgi9gGebF5ZqiIzXwX+o8vyBjzbaSh5V0S8NzNfAoiI97GH55sXMTVQREwB/oLazIIPZuaKJpck7REi4rPAl6nNxJrU3ol9NTN/1NTCmshwl1SE+nxAH6PWufpFZu7R75gNd0kqkB+oSlKBDHdJKpDhLkkFMtwlqUD/D8IypErrsxOtAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p1 = plotUsageComparation(df_json, df_other, 'operation')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Above the mean sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAECCAYAAAAFL5eMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEUZJREFUeJzt3X+M1/V9wPHnq/zwWkHd4MzE00I6quK1IlwBSwOYagTtZFrT6lw3EytpWjXGlkm1Mc5l61Y72y7RbbhqZxtQZ1ckysKSWrX+LEdFCxoNtTivkPa8VqZYK7jX/vieeJ6H97njy33v3vd8JCb3+Xzf970Xhnvyuc99vp9vZCaSpLK8p9EDSJLqz7hLUoGMuyQVyLhLUoGMuyQVyLhLUoGMuyQVyLhLUoGMuyQVaGyjvvDkyZNz6tSpjfrykjQibdy48cXMbO5vXcPiPnXqVNrb2xv15SVpRIqI56us87SMJBXIuEtSgYy7JBWoYefcJamK3bt309HRwWuvvdboUYZUU1MTLS0tjBs3blCfb9wlDWsdHR1MnDiRqVOnEhGNHmdIZCZdXV10dHQwbdq0QT1Hv6dlIuLmiPh1RGzex+MREf8UEVsj4smImDWoSSSpD6+99hqTJk0aNWEHiAgmTZq0Xz+tVDnn/h1g8bs8vgSY3v3fMuCfBz2NJPVhNIX9Tfv7Z+437pn5APCbd1myFLg1ax4FDouII/ZrKknSfqnHOfcjgRd6bHd079vRe2FELKN2dM/RRx9dhy994E1dcU+jR6hk29+f0egRpCFR7+/Jqt87H/3oR3n44Yfr+rUPpHrEva+fHfp81+3MXAmsBGhra/OduevpmkMbPUE11+xs9ATSoIyksEN9rnPvAI7qsd0CbK/D80rSsDFhwgR27NjBggULmDlzJq2trfz4xz8GYPXq1XzoQx+itbWVK6644m2fc9VVV3HCCScwb948fvWrXw3ZvPWI+1rgL7qvmpkH7MzMd5ySkaSRbtWqVZx22mls2rSJJ554gpkzZ7J9+3auuOIK7r33XjZt2sSGDRtYs2YNALt27WLevHk88cQTLFiwgJtuumnIZq1yKeRq4BHgmIjoiIgLI+JzEfG57iXrgOeArcBNwOcP2LSS1EAf+chHuOWWW7jmmmv42c9+xsSJE9mwYQOLFi2iubmZsWPHcv755/PAAw8AMH78eD7xiU8AMHv2bLZt2zZks/Z7zj0zz+vn8QS+ULeJJGmYWrBgAQ888AD33HMPn/nMZ1i+fDmHHHLIPtePGzdu7yWNY8aMYc+ePUM1qveWkaSqnn/+eQ4//HAuuugiLrzwQn76058yd+5c7r//fl588UXeeOMNVq9ezcKFCxs9qrcfkDSyNOqy34jgvvvu47rrrmPcuHFMmDCBW2+9lSOOOIKvfvWrnHzyyWQmp59+OkuXLm3IjG+bt3ZWZei1tbXlSHizjhFznXvTnzV6hGq8FFID9PTTT3Pcccc1dIauri5mzZrF889Xep+Muunrzx4RGzOzrb/P9bSMJL2L7du3c9JJJ/GlL32p0aMMiKdlJOldTJkyhWeffbbR
YwyYR+6SVCDjLkkFMu6SVCDjLkkF8heqkkaWet8BdZCX57700kusWrWKz3++dseV++67j69//evcfffd9Zxu0Dxyl6RBeOmll7jxxhvr9nz1vjWBcZekCq6//npaW1tpbW3lm9/8JitWrODnP/85M2fOZPny5QC88sornHPOORx77LGcf/75vPki0Y0bN7Jw4UJmz57Naaedxo4dtRvnLlq0iCuvvJKFCxfyrW99q67zelpGkvqxceNGbrnlFh577DEyk7lz5/K9732PzZs3s2nTJqB2Wubxxx9ny5YtTJkyhfnz5/PQQw8xd+5cLrnkEu666y6am5u5/fbbueqqq7j55puB2k8A999/f91nNu6S1I8HH3yQs846i4MPPhiAs88+e+8bdfQ0Z84cWlpaAJg5cybbtm3jsMMOY/PmzZx66qkAvPHGGxxxxFtvM/3pT3/6gMxs3CWpH1XvwXXQQQft/fjNW/xmJscffzyPPPJIn5/z5j8Y9eY5d0nqx4IFC1izZg2vvvoqu3bt4gc/+AHz58/n5Zdf7vdzjznmGDo7O/fGfffu3WzZsuVAj+yRu6QRpgF3Fp01axYXXHABc+bMAeCzn/0ss2fPZv78+bS2trJkyRLOOKPvWxGPHz+eO++8k0svvZSdO3eyZ88eLrvsMo4//vgDOrO3/O2Ht/ytM2/5qwEaDrf8bRRv+StJehvjLkkFMu6Shr1GnT5upP39Mxt3ScNaU1MTXV1doyrwmUlXVxdNTU2Dfg6vlpE0rLW0tNDR0UFnZ2ejRxlSTU1Ne18QNRjGXdKwNm7cOKZNm9boMUYcT8tIUoGMuyQVyLhLUoGMuyQVyLhLUoGMuyQVyLhLUoGMuyQVqFLcI2JxRDwTEVsjYkUfjx8dET+KiMcj4smIOL3+o0qSquo37hExBrgBWALMAM6LiBm9ln0FuCMzTwTOBW6s96CSpOqqHLnPAbZm5nOZ+TpwG7C015oEDun++FBge/1GlCQNVJV7yxwJvNBjuwOY22vNNcB/R8QlwMHAKXWZTpI0KFWO3KOPfb3vvXke8J3MbAFOB74bEe947ohYFhHtEdE+2u7wJklDqUrcO4Cjemy38M7TLhcCdwBk5iNAEzC59xNl5srMbMvMtubm5sFNLEnqV5W4bwCmR8S0iBhP7Rema3ut+R/g4wARcRy1uHtoLkkN0m/cM3MPcDGwHnia2lUxWyLi2og4s3vZF4GLIuIJYDVwQY6mt02RpGGm0pt1ZOY6YF2vfVf3+PgpYH59R5MkDZavUJWkAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAlWKe0QsjohnImJrRKzYx5pPRcRTEbElIlbVd0xJ0kCM7W9BRIwBbgBOBTqADRGxNjOf6rFmOvBlYH5m/jYiDj9QA0uS+lflyH0OsDUzn8vM14HbgKW91lwE3JCZvwXIzF/Xd0xJ0kBUifuRwAs9tju69/X0QeCDEfFQRDwaEYv7eqKIWBYR7RHR3tnZObiJJUn9qhL36GNf9toeC0wHFgHnAf8WEYe945MyV2ZmW2a2NTc3D3RWSVJFVeLeARzVY7sF2N7Hmrsyc3dm/gJ4hlrsJUkNUCXuG4DpETEtIsYD5wJre61ZA5wMEBGTqZ2mea6eg0qSqus37pm5B7gYWA88DdyRmVsi4tqIOLN72XqgKyKeAn4ELM/MrgM1tCTp3fV7KSRAZq4D1vXad3WPjxO4vPs/SVKD+QpVSSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZek
Ahl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAhl3SSqQcZekAlWKe0QsjohnImJrRKx4l3XnRERGRFv9RpQkDVS/cY+IMcANwBJgBnBeRMzoY91E4FLgsXoPKUkamCpH7nOArZn5XGa+DtwGLO1j3d8AXwNeq+N8kqRBqBL3I4EXemx3dO/bKyJOBI7KzLvrOJskaZCqxD362Jd7H4x4D/AN4Iv9PlHEsohoj4j2zs7O6lNKkgakStw7gKN6bLcA23tsTwRagfsiYhswD1jb1y9VM3NlZrZlZltzc/Pgp5Ykvasqcd8ATI+IaRExHjgXWPvmg5m5MzMnZ+bUzJwKPAqcmZntB2RiSVK/+o17Zu4BLgbWA08Dd2Tmloi4NiLOPNADSpIGbmyVRZm5DljXa9/V+1i7aP/HkiTtD1+hKkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFMu6SVCDjLkkFqhT3iFgcEc9ExNaIWNHH45dHxFMR8WRE/DAi3l//USVJVfUb94gYA9wALAFmAOdFxIxeyx4H2jLzw8CdwNfqPagkqboqR+5zgK2Z+Vxmvg7cBiztuSAzf5SZr3ZvPgq01HdMSdJAVIn7kcALPbY7uvfty4XAf/X1QEQsi4j2iGjv7OysPqUkaUCqxD362Jd9Loz4c6ANuK6vxzNzZWa2ZWZbc3Nz9SklSQMytsKaDuCoHtstwPbeiyLiFOAqYGFm/r4+40mSBqPKkfsGYHpETIuI8cC5wNqeCyLiROBfgTMz89f1H1OSNBD9xj0z9wAXA+uBp4E7MnNLRFwbEWd2L7sOmAD8R0Rsioi1+3g6SdIQqHJahsxcB6zrte/qHh+fUue5JEn7wVeoSlKBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFci4S1KBjLskFahS3CNicUQ8ExFbI2JFH48fFBG3dz/+WERMrfegkqTq+o17RIwBbgCWADOA8yJiRq9lFwK/zcw/Br4B/EO9B5UkVVflyH0OsDUzn8vM14HbgKW91iwF/r374zuBj0dE1G9MSdJAjK2w5kjghR7bHcDcfa3JzD0RsROYBLzYc1FELAOWdW++EhHPDGZovVPAZHr9/x6W/tp/80ehkfF3c+R4f5VFVeLe13djDmINmbkSWFnha2qAIqI9M9saPYfUm383G6PKaZkO4Kge2y3A9n2tiYixwKHAb+oxoCRp4KrEfQMwPSKmRcR44Fxgba81a4G/7P74HODezHzHkbskaWj0e1qm+xz6xcB6YAxwc2ZuiYhrgfbMXAt8G/huRGyldsR+7oEcWn3ydJeGK/9uNkB4gC1J5fEVqpJUIOMuSQUy7pJUIOMuqe4i4qAq+3TgGPcRKiKmVdknNcgjFffpAKnyClUNT98HZvXadycwuwGzSABExB9Rux3JeyPiRN569fohwPsaNtgoZNxHmIg4FjgeODQizu7x0CFAU2OmkvY6DbiA2ivZr++x/3+BKxsx0Gjlde4jTEQsBf4UOJO3v1L4ZeC2zHy4IYNJPUTEJzPz+42eYzQz7iNURJyUmZ7D1LDUfXrmb4Epmbmk+z0gTsrMbzd4tFHDX6iOXF0R8cOI2AwQER+OiK80eiip2y3UblkypXv7WeCyxo0z+hj3kesm4MvAboDMfBLv6aPhY3Jm3gH8H9TuUQW8
0diRRhfjPnK9LzN/0mvfnoZMIr3TroiYRPf7OkTEPGBnY0caXbxaZuR6MSI+wFvfPOcAOxo7krTX5dR+4f+BiHgIaKZ2O3ANEeM+cn2B2q1Uj42IXwK/AM5v7EjSXh8AllB7E59PUntrTnszhLxaZoSKiMu7P3wvtdNru6j92LsxMzc1bDAJiIgnM/PDEfEx4O+AfwSuzMze77+sA8Rz7iNXG/A54A+Aw6i98fgi4KaI+KsGziXBW788PQP4l8y8CxjfwHlGHY/cR6iIWA98MjNf6d6eQO32A2dRO3qf0cj5NLpFxN3AL4FTqN0S43fATzLzhIYONop45D5yHQ283mN7N/D+zPwd8PvGjCTt9Slq17kvzsyXgD8Eljd2pNHFX3CMXKuARyPiru7tPwFWR8TBwFONG0uCzHwV+M8e2zvwaq4h5WmZESwiZgMfo3bnvQczs73BI0kaJoy7JBXIc+6SVCDjLkkFMu6SVCDjLkkF+n9G2kb/7zCg+wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p2 = plotUsageComparation(df_a_json, df_a_other, 'operation')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonotherjsonother
get0.9967190.6167281.00.999047
set0.0032810.131672NaN0.000953
callNaN0.251601NaNNaN
\n", + "
" + ], + "text/plain": [ + " json other json other\n", + "get 0.996719 0.616728 1.0 0.999047\n", + "set 0.003281 0.131672 NaN 0.000953\n", + "call NaN 0.251601 NaN NaN" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEDCAYAAADOc0QpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFhZJREFUeJzt3X2QVfWd5/H3NzzIjqBGbXfB1kBlHUWa2EoLKFsNzmgh6sro9EaFdbDKh0o56rqJLIzsptjsTJkZXUenSmejiUyMC+o4K1LKllsVA8SHuDTajCDRQgfXHiiDJLCK6wSo7/5xW9JpWvr05dKXPr5fVZT3nPs79366Ln4499fnITITSVK5fKneASRJtWe5S1IJWe6SVEKWuySVkOUuSSVkuUtSCVnuklRClrsklZDlLkklNLReb3ziiSfm2LFj6/X2kjQorVu37sPMbOhrXN3KfezYsbS3t9fr7SVpUIqI94qMc1pGkkrIcpekErLcJamE6jbnLklF7Nmzh87OTj799NN6RxlQI0aMoLGxkWHDhlW1veUu6YjW2dnJqFGjGDt2LBFR7zgDIjPZsWMHnZ2djBs3rqrX6HNaJiIeiYhfRMSGz3k+IuKvImJzRPx9RJxTVRJJ6sWnn37KCSec8IUpdoCI4IQTTjikbytF5tz/Brj4IM/PAk7r+nMT8NdVp5GkXnyRiv0zh/oz91numbkG+OVBhswGHs2KnwHHRcToQ0olSToktZhzPxl4v9tyZ9e6bT0HRsRNVPbuOfXUU2vw1keuiT+cWNV2b8x7o8ZJpHIZu/C5mr7elu9eWmjc+eefz8svv1zT9z6calHuvX136PWu25n5EPAQQEtLy4DembvavxBFP/ha2XTG+Kq2G//zTTVOIqm7wVTsUJvj3DuBU7otNwJba/C6knTEGDlyJNu2baO1tZXm5maampr46U9/CsCyZcuYOHEiTU1NLFiw4Le2WbRoEWeddRZTp07lgw8+GLC8tSj3FcAfdR01MxXYlZkHTMlI0mC3dOlSZs6cSUdHB+vXr6e5uZmtW7eyYMECXnjhBTo6Oli7di3Lly8HYPfu3UydOpX169fT2trKww8/PGBZixwKuQx4BTg9Ijoj4vqI+EZEfKNryErgXWAz8DBw82FLK0l1dO6557JkyRIWL17MG2+8wahRo1i7di0zZsygoaGBoUOHMnfuXNasWQPA8OHDueyyywCYNGkSW7ZsGbCsfc65Z+Y1fTyfwB/XLJEkHaFaW1tZs2YNzz33HNdeey3z58/nmGOO+dzxw4YN239I45AhQ9i7d+9ARfXaMpJU1HvvvcdJJ53EjTfeyPXXX89rr73GlClTWL16NR9++CH79u1j2bJlTJ8+vd5RvfyApMFloI9g+0xEsGrVKu6++26GDRvGyJEjefTRRxk9ejR33XUXF1xwAZnJJZdcwuzZs+uSsTvLXZL6sGPHDo4//njmzZvHvHnzDnh+zpw5zJkz54D1H3/88f7HbW1ttLW1Hdac3TktI0kHsXXrVs477zzuuOOOekfpF/fcJekgxowZw9tvv13vGP3mnrsklZDlLkklZLlLUglZ7pJUQv5CVdLgsvjYGr/erqo227lzJ0uXLuXmmytXXFm1ahX33HMPzz77bC3TVc09d0mqws6dO3nwwQdr9nq1vjSB5S5JBdx77700NTXR
1NTEfffdx8KFC3nnnXdobm5m/vz5QOWkpba2Ns444wzmzp1L5dJbsG7dOqZPn86kSZOYOXMm27ZVLpw7Y8YM7rzzTqZPn879999f07xOy0hSH9atW8eSJUt49dVXyUymTJnCY489xoYNG+jo6AAq0zKvv/46GzduZMyYMUybNo2XXnqJKVOmcOutt/LMM8/Q0NDAE088waJFi3jkkUeAyjeA1atX1zyz5S5JfXjxxRe54oorOProowG48sor99+oo7vJkyfT2NgIQHNzM1u2bOG4445jw4YNXHTRRQDs27eP0aN/c5vpq6666rBkttwlqQ+fTa/05aijjtr/+LNL/GYmEyZM4JVXXul1m8/+wag159wlqQ+tra0sX76cTz75hN27d/P0008zbdo0Pvrooz63Pf3009m+ffv+ct+zZw8bN2483JHdc5c0yFR56OKhOOecc7juuuuYPHkyADfccAOTJk1i2rRpNDU1MWvWLC69tPdLEQ8fPpynnnqK2267jV27drF3715uv/12JkyYcFgzR9GvG7XW0tKS7e3tA/Z+Yxc+V9V21V47euIPJ1a13ZN3VXc41Pifb6pqO+lIt2nTJsaPH1/vGHXR288eEesys6WvbZ2WkaQSstwlqYQsd0kqIctdkkrIcpekErLcJamEPM5d0qBS7WHGn+eNeW8UGnf++efz8ssv1/S9Dyf33CWpgMFU7GC5S1IhI0eOZNu2bbS2ttLc3ExTU9P+i4ctW7aMiRMn0tTUxIIFC35rm0WLFnHWWWcxdepUPvjggwHLa7lLUkFLly5l5syZdHR0sH79epqbm9m6dSsLFizghRdeoKOjg7Vr17J8+XIAdu/ezdSpU1m/fj2tra08/PDDA5bVcpekgs4991yWLFnC4sWLeeONNxg1ahRr165lxowZNDQ0MHToUObOncuaNWuAynVlLrvsMgAmTZrEli1bBiyr5S5JBbW2trJmzRpOPvlkrr32Wh599NGDXg542LBhRATwm0sADxTLXZIKeu+99zjppJO48cYbuf7663nttdeYMmUKq1ev5sMPP2Tfvn0sW7aM6dOn1zuqh0JKGlyKHrpYaxHBqlWruPvuuxk2bBgjR47k0UcfZfTo0dx1111ccMEFZCaXXHIJs2fPrkvG7gqVe0RcDNwPDAG+n5nf7fH8qcAPgeO6xizMzJU1zipJdbFjxw6OP/545s2bx7x58w54fs6cOcyZM+eA9R9//PH+x21tbbS1tR3WnN31OS0TEUOAB4BZwJnANRFxZo9h/xF4MjPPBq4GHqx1UEmqh61bt3Leeedxxx131DtKvxTZc58MbM7MdwEi4nFgNvBmtzEJHNP1+Fhgay1DSlK9jBkzhrfffrveMfqtSLmfDLzfbbkTmNJjzGLgf0XErcDRwIU1SSdJqkqRo2Wil3U9j/25BvibzGwELgF+FBEHvHZE3BQR7RHRvn379v6nlSQVUqTcO4FTui03cuC0y/XAkwCZ+QowAjix5wtl5kOZ2ZKZLQ0NDdUlliT1qUi5rwVOi4hxETGcyi9MV/QY83+A3weIiPFUyt1dc0mqkz7n3DNzb0TcAjxP5TDHRzJzY0R8B2jPzBXAt4CHI+LfU5myuS4PdtqWJFVp0xnja/p643++qartdu7cydKlS7n55psBWLVqFffccw/PPvtsLeNVrdBx7l3HrK/sse7b3R6/CUyrbTRJOnLt3LmTBx98cH+5H6q9e/cydGjtziv18gOSVMC9995LU1MTTU1N3HfffSxcuJB33nmH5uZm5s+fD1ROWmpra+OMM85g7ty5+687s27dOqZPn86kSZOYOXMm27ZtA2DGjBnceeedTJ8+nfvvv7+meb38gCT1Yd26dSxZsoRXX32VzGTKlCk89thjbNiwgY6ODqAyLfP666+zceNGxowZw7Rp03jppZeYMmUKt956K8888wwNDQ088cQTLFq0iEceeQSofANYvXp1zTNb7pLUhxdffJErrriC
o48+GoArr7xy/406ups8eTKNjY0ANDc3s2XLFo477jg2bNjARRddBMC+ffsYPXr0/m2uuuqqw5LZcpekPhQ9PuSoo47a//izS/xmJhMmTOCVV17pdZvP/sGoNefcJakPra2tLF++nE8++YTdu3fz9NNPM23aND766KM+tz399NPZvn37/nLfs2cPGzduPNyR3XOXNLhUe+jioTjnnHO47rrrmDx5MgA33HADkyZNYtq0aTQ1NTFr1iwuvfTSXrcdPnw4Tz31FLfddhu7du1i79693H777UyYMOGwZo56HY7e0tKS7e3tA/Z+Yxc+V9V2W77b+wfWl4k/nFjVdk/eVd2dWurxF14aCJs2bWL8+Noe2z5Y9PazR8S6zGzpa1unZSSphCx3SSohy13SEe+LeDWTQ/2ZLXdJR7QRI0awY8eOL1TBZyY7duxgxIgRVb+GR8tIOqI1NjbS2dnJF+0eECNGjNh/QlQ1LHdJR7Rhw4Yxbty4escYdJyWkaQSstwlqYQsd0kqIctdkkrIcpekErLcJamELHdJKiHLXZJKyHKXpBKy3CWphCx3SSohy12SSshyl6QSstwlqYQsd0kqIctdkkrIcpekErLcJamELHdJKiHLXZJKqNANsiPiYuB+YAjw/cz8bi9jvg4sBhJYn5lzapizfhYfW912406tbQ5J6oc+yz0ihgAPABcBncDaiFiRmW92G3Ma8CfAtMz8VUScdLgCS5L6VmRaZjKwOTPfzcxfA48Ds3uMuRF4IDN/BZCZv6htTElSfxQp95OB97std3at6+53gd+NiJci4mdd0zgHiIibIqI9Itq3b99eXWJJUp+KlHv0si57LA8FTgNmANcA34+I4w7YKPOhzGzJzJaGhob+ZpUkFVSk3DuBU7otNwJbexnzTGbuycx/AN6iUvaSpDooUu5rgdMiYlxEDAeuBlb0GLMcuAAgIk6kMk3zbi2DSpKK67PcM3MvcAvwPLAJeDIzN0bEdyLi8q5hzwM7IuJN4CfA/MzccbhCS5IOrtBx7pm5EljZY923uz1O4JtdfyRJdeYZqpJUQpa7JJWQ5S5JJWS5S1IJWe6SVEKWuySVkOUuSSVkuUtSCVnuklRClrsklZDlLkklZLlLUglZ7pJUQpa7JJWQ5S5JJWS5S1IJWe6SVEKWuySVkOUuSSVkuUtSCVnuklRClrsklZDlLkklZLlLUglZ7pJUQpa7JJWQ5S5JJWS5S1IJWe6SVEKWuySVkOUuSSVkuUtSCRUq94i4OCLeiojNEbHwIOPaIiIjoqV2ESVJ/dVnuUfEEOABYBZwJnBNRJzZy7hRwG3Aq7UOKUnqnyJ77pOBzZn5bmb+GngcmN3LuP8C/AXwaQ3zSZKqUKTcTwbe77bc2bVuv4g4GzglM5+tYTZJUpWKlHv0si73PxnxJeAvgW/1+UIRN0VEe0S0b9++vXhKSVK/FCn3TuCUbsuNwNZuy6OAJmBVRGwBpgIrevulamY+lJktmdnS0NBQfWpJ0kEVKfe1wGkRMS4ihgNXAys+ezIzd2XmiZk5NjPHAj8DLs/M9sOSWJLUpz7LPTP3ArcAzwObgCczc2NEfCciLj/cASVJ/Te0yKDMXAms7LHu258zdsahx5IkHQrPUJWkErLcJamELHdJKiHLXZJKyHKXpBKy3CWphCx3SSohy12SSshyl6QSstwlqYQsd0kqIctdkkrIcpekErLcJamELHdJKiHLXZJKyHKXpBKy3CWphCx3SSohy12SSshyl6QSstwlqYQsd0kqIctdkkrIcpekEhpa7wCS1KvFx1a53a7a5hik3HOXpBKy3CWphCx3SSohy12SSshyl6QSstwlqYQsd0kqoULlHhEXR8RbEbE5Ihb28vw3I+LNiPj7iPhxRHyl9lElSUX1We4RMQR4AJgFnAlcExFn9hj2OtCSmV8DngL+otZBJUnFFdlznwxszsx3M/PXwOPA7O4DMvMnmflJ1+LPgMbaxpQk9UeR
cj8ZeL/bcmfXus9zPfA/e3siIm6KiPaIaN++fXvxlJKkfilS7tHLuux1YMS/BVqAu3t7PjMfysyWzGxpaGgonlKS1C9FLhzWCZzSbbkR2NpzUERcCCwCpmfmP9UmniSpGkX23NcCp0XEuIgYDlwNrOg+ICLOBr4HXJ6Zv6h9TElSf/RZ7pm5F7gFeB7YBDyZmRsj4jsRcXnXsLuBkcDfRkRHRKz4nJeTJA2AQtdzz8yVwMoe677d7fGFNc4lSToEnqEqSSVkuUtSCVnuklRClrsklZDlLkklZLlLUglZ7pJUQpa7JJVQoZOYpEFr8bFVbLOr9jmkAeaeuySVkOUuSSVkuUtSCVnuklRClrsklZDlLkklZLlLUglZ7pJUQpa7JJWQ5S5JJWS5S1IJWe6SVEKWuySVkOUuSSVkuUtSCVnuklRClrsklZDlLkklZLlLUglZ7pJUQpa7JJWQ5S5JJWS5S1IJFSr3iLg4It6KiM0RsbCX54+KiCe6nn81IsbWOqgkqbg+yz0ihgAPALOAM4FrIuLMHsOuB36Vmf8S+Evgz2sdVJJUXJE998nA5sx8NzN/DTwOzO4xZjbww67HTwG/HxFRu5iSpP4YWmDMycD73ZY7gSmfNyYz90bELuAE4MPugyLiJuCmrsWPI+KtakIPpOr/hdpwIj1+/iJ6fiUqzH9La+c/R1WfnY4Q5f/8vlJkUJFy7601sooxZOZDwEMF3nPQi4j2zGypdw71n5/d4ObnV1FkWqYTOKXbciOw9fPGRMRQ4Fjgl7UIKEnqvyLlvhY4LSLGRcRw4GpgRY8xK4B5XY/bgBcy84A9d0nSwOhzWqZrDv0W4HlgCPBIZm6MiO8A7Zm5AvgB8KOI2Exlj/3qwxl6kPhCTD+VlJ/d4ObnB4Q72JJUPp6hKkklZLlLUglZ7pJUQpa7ROX6SEXWSYOF5V4jETGuyDodsV4puE4aFIqcoapi/g44p8e6p4BJdciigiLiX1C5fMY/i4iz+c3Z1scAv1O3YCokIr55sOcz896BynKksdwPUUScAUwAjo2IK7s9dQwwoj6p1A8zgeuonHndvQj+L3BnPQKpX0bVO8CRyuPcD1FEzAb+ALic3z5z9yPg8cx8uS7B1C8R8YeZ+Xf1ziHViuVeIxFxXmY6RztIdU3P/BkwJjNndd2z4LzM/EGdo+kgIuKvDvZ8Zt42UFmONE7L1M6OiPgx8M8zsykivgZcnpl/Wu9gKmRJ159FXctvA09QubSGjlzr6h3gSOWee41ExGpgPvC9zDy7a92GzGyqbzIVERFrM/PciHi92+fXkZnN9c4mVcM999r5ncz83z1uQLW3XmHUb7sj4gS67kMQEVOBXfWNpKIiogFYQOV+N/sPZMjM36tbqDqz3Gvnw4j4Kr8phzZgW30jqR++SeUX4l+NiJeABiqXr9bg8N+pTKNdCnyDyiXIt9c1UZ1Z7rXzx1QuNXpGRPwj8A/A3PpGUj98lcpN4E8B/pDKrST9/2PwOCEzfxAR/y4zVwOru6ZKv7D8y1s7fwCsBH5C5czf3cCFEbEuMzvqmkxF/KfM/NuI+DJwIfBfgb/mwPsF68i0p+u/2yLiUip3i2usY5668/IDtdNC5evgl4HjqNwIfAbwcET8hzrmUjH7uv57KfDfMvMZYHgd86h//jQijgW+BdwBfB+4vb6R6styr50TgHMy847M/BaVsm8AWqmcAakj2z9GxPeArwMruy4a5v8fg8e/oXL034bMvAC4CLiizpnqyr+8tXMq8Otuy3uAr2Tm/wP+qT6R1A9fp3IryYszcydwPJVDWzU4fK3rcwMgM38JnF3HPHXnnHvtLAV+FhHPdC3/a2BZRBwNvFm/WCoiMz8B/ke35W14tNNg8qWI+HJm/gogIo7nC95vnsRUQxExCfhXVK4s+GJmttc5kvSFEBF/BPwJlSuxJpVvYn+WmT+qa7A6stwllULX9YB+j8rO1Y8z8wv9jdlyl6QS8heqklRC
lrsklZDlLkklZLlLUgn9fwMN6ewEpG0IAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p3 = pd.concat([p1, p2], axis=1, sort=False).drop_duplicates()\n", + "p3.plot(kind='bar')\n", + "p3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SYMBOLS\n", + "There is a pretty big difference[1] in the number of unique values that appear in the whole sample and in the filtered sample. \n", + "For the whole sample there are as many as 245 different symbols among the non-JSON values, but this is drastically reduced to 2 symbols[2] in the filtered sample, which only contains values with value_len above the mean, most of them being 'window.document.cookie' (99%). \n", + "For the valid JSONs there are only 12 symbols in total, reduced to 5 symbols[3] in the filtered sample. \n", + "\n", + "---\n", + " For further investigation: \n", + "1. Why is the difference so big? Does it have any meaning?\n", + "2. What is the meaning of the 2 symbols of the non-JSON values? Are they special? Why only 2? Why 'window.document.cookie'?\n", + "3. Why these 5 symbols? what do they do? 
what do they represent?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full Sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 245 unique symbol present on the non-json dataset and 12 on the JSONs\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEOCAYAAACHE9xHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAE8NJREFUeJzt3X9wXWWdx/H3FxqoClYshQVaCDhFobaEEll2caVLcalF5cfKjw7ajnWtzoDKLhXbsjP+WMXKoLLsrJ3BUSlM15atMjCIbKFTBpzxV1MjtFS0owVCawmIFQRqW777R04wtGmTJrm9yZP3a+bOOee5zznnmyb95OS5zz03MhNJUrkOqHcBkqTaMuglqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhRtR7wIADj/88GxsbKx3GZI0pLS0tDyTmWN66jcogr6xsZHVq1fXuwxJGlIi4vHe9HPoRpIK12PQR8S4iFgVEesjYl1EfKpq/1xEPBURrdVjepd95kfEhoh4LCLOreUXIEnau94M3ewArs7MNRFxKNASEfdVz309M2/o2jkiTgYuAyYARwP3R8SJmblzIAuXJPVOj0GfmZuBzdX68xGxHjhmL7ucDyzNzG3A7yJiA3A68ON9KWz79u20tbXx8ssv78tuqqGRI0cyduxYGhoa6l2KpH2wTy/GRkQjcCrwU+BM4MqImAmspuOq/zk6fgn8pMtubXTziyEi5gBzAI499tjdztXW1sahhx5KY2MjEbEvZaoGMpNnn32WtrY2jj/++HqXI2kf9PrF2Ig4BPgecFVm/glYBLwFaKLjiv+rnV272X23TzfJzJszszkzm8eM2X120Msvv8zo0aMN+UEiIhg9erR/YUlDUK+CPiIa6Aj5JZn5fYDM3JKZOzPzFeCbdAzPQMcV/Lguu48FNvWlOEN+cPH7IQ1NvZl1E8C3gPWZ+bUu7Ud16XYhsLZavwu4LCIOjojjgfHAzwauZEnSvujNGP2ZwIeARyKitWpbAMyIiCY6hmU2Ah8DyMx1EXE78CgdM3auGIgZN43zftDfQ7zGxoXn7fX5KVOmMH/+fM4996+zQ2+88UZ+/etf841vfGOP+x1yyCG88MILA1LjrbfeyvXXX09mkpnMnj2buXPnDsixO1133XUsWLBgQI+p/W+g/38Mdz3lw1DT4xV9Zv4oMyMzJ2VmU/W4JzM/lJkTq/b3V7NzOvf5Uma+JTPfmpk/rO2XUBszZsxg6dKlr2lbunQpM2bM2C/n/+EPf8iNN97IihUrWLduHWvWrGHUqFEDfp7rrrtuwI8paXDxnbF78IEPfIC7776bbdu2AbBx40Y2bdrEO9/5Tl544QWmTp3K5MmTmThxInfeeedu+z/wwAO8973vfXX7yiuv5JZbbgGgpaWFs846i9NOO41zzz2XzZs377b/l7/8ZW644QaOPvpooGNq40c/+lEAWltbOeOMM5g0aRIXXnghzz33HNDxV0jnrSSeeeYZOu8fdMstt3DRRRcxbdo0xo8fzzXXXAPAvHnzeOmll2hqauLyyy/nz3/+
M+eddx6nnHIKb3/721m2bNkA/EtKqjeDfg9Gjx7N6aefzr333gt0XM1feumlRAQjR47kjjvuYM2aNaxatYqrr76azN0mFnVr+/btfOITn2D58uW0tLQwe/Zsrr322t36rV27ltNOO63bY8ycOZOvfOUrPPzww0ycOJHPf/7zPZ63tbWVZcuW8cgjj7Bs2TKefPJJFi5cyOte9zpaW1tZsmQJ9957L0cffTS//OUvWbt2LdOmTevV1yRpcDPo96Lr8E3XYZvMZMGCBUyaNIlzzjmHp556ii1btvTqmI899hhr167l3e9+N01NTXzxi1+kra2t1zVt3bqVP/7xj5x11lkAzJo1iwcffLDH/aZOncqoUaMYOXIkJ598Mo8/vvu9kCZOnMj999/PZz7zGR566KGaDBVJ2v8M+r244IILWLlyJWvWrOGll15i8uTJACxZsoT29nZaWlpobW3lyCOP3G1++YgRI3jllVde3e58PjOZMGECra2ttLa28sgjj7BixYrdzj1hwgRaWlr2qd6u59y1noMPPvjV9QMPPJAdO3bstv+JJ55IS0sLEydOZP78+XzhC1/Yp/NLGpwM+r045JBDmDJlCrNnz37Ni7Bbt27liCOOoKGhgVWrVnV7dXzcccfx6KOPsm3bNrZu3crKlSsBeOtb30p7ezs//nHHHSG2b9/OunXrdtt//vz5XHPNNfz+978HYNu2bdx0002MGjWKww47jIceegiA22677dWr+8bGxld/OSxfvrxXX2NDQwPbt28HYNOmTbz+9a/ngx/8IHPnzmXNmjW9OoakwW1Q3I++N+o13WnGjBlcdNFFr5mBc/nll/O+972P5uZmmpqaeNvb3rbbfuPGjeOSSy5h0qRJjB8/nlNPPRWAgw46iOXLl/PJT36SrVu3smPHDq666iomTJjwmv2nT5/Oli1bOOecc8hMIoLZs2cDsHjxYj7+8Y/z4osvcsIJJ/Cd73wHgLlz53LJJZdw2223cfbZZ/fq65szZw6TJk1i8uTJzJw5k09/+tMccMABNDQ0sGjRoj79m0kaXKK3LyLWUnNzc+76wSPr16/npJNOqlNF2hO/L4OT8+gH1lCZRx8RLZnZ3FM/h24kqXAGvSQVblAH/WAYVtJf+f2QhqZBG/QjR47k2WefNVwGic770Y8cObLepUjaR4N21s3YsWNpa2ujvb293qWo0vkJU5KGlkEb9A0NDX6SkSQNgEE7dCNJGhgGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klS4HoM+IsZFxKqIWB8R6yLiU1X7myPivoj4TbU8rGqPiLgpIjZExMMRMbnWX4Qkac96c0W/A7g6M08CzgCuiIiTgXnAyswcD6ystgHeA4yvHnOARQNetSSp13oM+szcnJlrqvXngfXAMcD5wOKq22Lggmr9fODW7PAT4E0RcdSAVy5J6pV9GqOPiEbgVOCnwJGZuRk6fhkAR1TdjgGe7LJbW9UmSaqDXgd9RBwCfA+4KjP/tLeu3bRlN8ebExGrI2J1e3t7b8uQJO2jXgV9RDTQEfJLMvP7VfOWziGZavl01d4GjOuy+1hg067HzMybM7M5M5vHjBnT1/olST3ozaybAL4FrM/Mr3V56i5gVrU+C7izS/vMavbNGcDWziEeSdL+N6IXfc4EPgQ8EhGtVdsCYCFwe0R8BHgCuLh67h5gOrABeBH48IBWLEnaJz0GfWb+iO7H3QGmdtM/gSv6WZckaYD4zlhJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQV
zqCXpMIZ9JJUOINekgpn0EtS4Qx6SSpcj0EfEd+OiKcjYm2Xts9FxFMR0Vo9pnd5bn5EbIiIxyLi3FoVLknqnd5c0d8CTOum/euZ2VQ97gGIiJOBy4AJ1T7fiIgDB6pYSdK+6zHoM/NB4A+9PN75wNLM3JaZvwM2AKf3oz5JUj/1Z4z+yoh4uBraOaxqOwZ4skuftqpNklQnfQ36RcBbgCZgM/DVqj266ZvdHSAi5kTE6ohY3d7e3scyJEk96VPQZ+aWzNyZma8A3+SvwzNtwLguXccCm/ZwjJszszkzm8eMGdOXMiRJvdCnoI+Io7psXgh0zsi5C7gsIg6OiOOB8cDP+leiJKk/RvTUISK+C0wBDo+INuCzwJSIaKJjWGYj8DGAzFwXEbcDjwI7gCsyc2dtSpck9UaPQZ+ZM7pp/tZe+n8J+FJ/ipIkDRzfGStJhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcD0GfUR8OyKejoi1XdreHBH3RcRvquVhVXtExE0RsSEiHo6IybUsXpLUs95c0d8CTNulbR6wMjPHAyurbYD3AOOrxxxg0cCUKUnqqx6DPjMfBP6wS/P5wOJqfTFwQZf2W7PDT4A3RcRRA1WsJGnf9XWM/sjM3AxQLY+o2o8BnuzSr61q201EzImI1RGxur29vY9lSJJ6MtAvxkY3bdldx8y8OTObM7N5zJgxA1yGJKlTX4N+S+eQTLV8umpvA8Z16TcW2NT38iRJ/dXXoL8LmFWtzwLu7NI+s5p9cwawtXOIR5JUHyN66hAR3wWmAIdHRBvwWWAhcHtEfAR4Ari46n4PMB3YALwIfLgGNUuS9kGPQZ+ZM/bw1NRu+iZwRX+LkiQNHN8ZK0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhRvRn50jYiPwPLAT2JGZzRHxZmAZ0AhsBC7JzOf6V6Ykqa8G4or+HzOzKTObq+15wMrMHA+srLYlSXVSi6Gb84HF1fpi4IIanEOS1Ev9DfoEVkRES0TMqdqOzMzNANXyiO52jIg5EbE6Ila3t7f3swxJ0p70a4weODMzN0XEEcB9EfGr3u6YmTcDNwM0NzdnP+uQJO1Bv67oM3NTtXwauAM4HdgSEUcBVMun+1ukJKnv+hz0EfGGiDi0cx34J2AtcBcwq+o2C7izv0VKkvquP0M3RwJ3RETncf4nM++NiJ8Dt0fER4AngIv7X6Ykqa/6HPSZ+VvglG7anwWm9qcoSdLA8Z2xklQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCjei3gUMJY3zflDvEoqyceF59S5BGha8opekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFq1nQR8S0iHgsIjZExLxanUeStHc1CfqIOBD4b+A9wMnAjIg4uRbnkiTtXa2u6E8HNmTmbzPzL8BS4PwanUuStBe1CvpjgCe7bLdVbZKk/axWt0CIbtryNR0i5gBzqs0XIuKxGtUyHB0OPFPvInoSX6l3BaoDfzYH1nG96VSroG8DxnXZHgts6toh
M28Gbq7R+Ye1iFidmc31rkPalT+b9VGroZufA+Mj4viIOAi4DLirRueSJO1FTa7oM3NHRFwJ/B9wIPDtzFxXi3NJkvauZrcpzsx7gHtqdXztlUNiGqz82ayDyMyee0mShixvgSBJhTPoJalwBr2kmomIAyLi7+tdx3DnGP0QFxFjgI8CjXR5cT0zZ9erJqmriPhxZv5dvesYzvxw8KHvTuAh4H5gZ51rkbqzIiL+Gfh+emVZF17RD3ER0ZqZTfWuQ9qTiHgeeAMdFyIv0XGLlMzMN9a1sGHEMfqh7+6ImF7vIqQ9ycxDM/OAzGzIzDdW24b8fuQV/RDX5WrpL8D2qtmrJQ0aERHA5cDxmfkfETEOOCozf1bn0oYNg15STUXEIuAV4OzMPCkiDgNWZOY76lzasOGLsQWIiPcD76o2H8jMu+tZj7SLv83MyRHxC4DMfK662aH2E8foh7iIWAh8Cni0enyqapMGi+3Vx4smvDol+JX6ljS8OHQzxEXEw0BTZr5SbR8I/CIzJ9W3MqlDRFwOXApMBhYDHwD+PTP/t66FDSMO3ZThTcAfqvVR9SxE2lVmLomIFmAqHVMrL8jM9XUua1gx6Ie+LwO/iIhVdPwnehcwv74lSbv5DfAnqsyJiGMz84n6ljR8OHRTgIg4CngHHUH/08z8fZ1Lkl4VEZ8APgtsoeNNU51vmHJ4cT8x6Ie4iDgTaM3MP0fEB+kYB/3PzHy8zqVJAETEBjpm3jxb71qGK2fdDH2LgBcj4hTg08DjwK31LUl6jSeBrfUuYjhzjH7o25GZGRHnAzdl5rciYla9i5Ii4t+q1d8CD0TED4Btnc9n5tfqUtgwZNAPfc9HxHzgQ8A/VNMrG+pckwRwaLV8onocVD2gmlOv/cMx+iEuIv4GmAH8PDN/VN1HZEpm3lbn0iQAIuLiXefMd9em2jHoh6jqZmad37yollmtbwM2ANdm5so6lCe9KiLWZObkntpUOw7dDFGZeeienquGb94OLKmW0n4XEe8BpgPHRMRNXZ56I7CjPlUNTwZ9gTJzJ/DLiPiveteiYW0TsBq4GPg1HX9x7qRjPv2/1rGuYcehG0k1ERENwJeAfwE20jGsOA74DrAgM7fveW8NJOfRS6qV64HDgOMyc3JmngqcQMf9mG6oa2XDjFf0kmoiIn4DnLjrB4JXryH9KjPH16ey4ccrekm1kruGfNW4E+fR71cGvaRaeTQiZu7aWN2T6Vd1qGfYcuhGUk1ExDHA94GXgBY6ruLfAbwOuDAzn6pjecOKQS+ppiLibGACHbNu1vkmvv3PoJekwjlGL0mFM+glqXAGvSQVzqCXpMIZ9JJUuP8HBP+YLYpPQ7EAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p1 = plotUniqueValuesComparation(df_json, df_other, 'symbol')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
window.localStorage0.6530110.020276
window.sessionStorage0.3077430.009565
HTMLCanvasElement.style0.0208590.000780
window.document.cookieNaN0.342406
window.navigator.userAgentNaN0.149935
window.Storage.getItemNaN0.101712
\n", + "
" + ], + "text/plain": [ + " json other\n", + "window.localStorage 0.653011 0.020276\n", + "window.sessionStorage 0.307743 0.009565\n", + "HTMLCanvasElement.style 0.020859 0.000780\n", + "window.document.cookie NaN 0.342406\n", + "window.navigator.userAgent NaN 0.149935\n", + "window.Storage.getItem NaN 0.101712" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotTopUsageComparation(df_json, df_other, 'symbol', 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Above the mean Sample:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 2 unique symbol present on the non-json dataset and 5 on the JSONs\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAEOCAYAAACpVv3VAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEKNJREFUeJzt3X9s1HWex/HXGxitLsgarHciYDXBX0gtpXLe6QkHKAi6q5yL2+Bi0jsbk/PXKSLFixdzHqIxG9bLaY7crihLFu44jRdUjtWDiImny9Si/FDXGFwrioXVrih0i7zvj04JlMJ8kZl+590+H0nTmel3vvOGlme+fOY7U3N3AQDi6Jf2AACAY0O4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEM6AYOz3ttNO8oqKiGLsGgF4pm83udPfyJNsWJdwVFRXasGFDMXYNAL2SmX2UdFuWSgAgGMINAMEQbgAIpihr3ABKQ3t7u5qbm7V37960R0FOWVmZhg0bpkwm8533QbiBXqy5uVmDBg1SRUWFzCztcfo8d9euXbvU3Nyss88++zvvJ1G4zWybpK8kfStpn7vXfOdHBNBj9u7dS7RLiJlpyJAhamlpOa79HMsR91+5+87jejQAPY5ol5ZCfD94chIAgkl6xO2S1piZS/o3d1/cdQMzq5dUL0kjRowo3IRFUjHvhbRH6FW2LZye9ghIoNA/9/m+7xMmTFBDQ4OmTJly4LZFixbp/fff1xNPPHHE+w0cOFC7d+8uyIzPPPOMHn30Ubm73F11dXWaM2dOQfbdacGCBZo/f35B93k0SY+4L3P3aklXS/o7M7ui6wbuvtjda9y9prw80as2AfRytbW1Wr58+SG3LV++XLW1tT3y+C+99JIWLVqkNWvWaPPmzWpsbNTgwYML/jgLFiwo+D6PJlG43X177vPnkp6TNK6YQwHoHW644QatWrVKbW1tkqRt27Zp+/btuvzyy7V7925NmjRJ1dXVGj16tJ5//vnD7r9u3Tpdc801B67fdtttWrJkiSQpm81q/PjxGjt2rKZMmaJPP/30sPs//PDDeuyxxzR06FBJHafi3XLLLZKkpqYmXXrppaqsrNT111+vL774QlLH/xI637Jj586d6nzfpSVLlmjGjBmaOnWqRo4cqblz50qS5s2bpz179qiqqkqzZs3S119/renTp+viiy/WRRddpBUrVhTgb/JQecNtZt8zs0GdlyVdJWlTwScB0OsMGTJE48aN0+rVqyV1HG3feOONMjOVlZXpueeeU2Njo9auXat77rlH7p5ov+3t7br99tu1cuVKZbNZ1dXV6f777z9su02bNmns2LHd7mP27Nl65JFH9Pbbb2v06NF68MEH8z5uU1OTVqxYoXfeeUcrVqzQxx9/rIULF+q
kk05SU1OTli1bptWrV2vo0KHauHGjNm3apKlTpyb6Mx2LJEfcfyLpNTPbKOlNSS+4++qCTwKgVzp4ueTgZRJ31/z581VZWanJkyfrk08+0Y4dOxLt87333tOmTZt05ZVXqqqqSg899JCam5sTz9Ta2qovv/xS48ePlyTdfPPNevXVV/Peb9KkSRo8eLDKysp04YUX6qOPDn9fqNGjR+vll1/Wfffdp/Xr1xdlaSbvk5Pu/qGkiwv+yAD6hOuuu0533323GhsbtWfPHlVXV0uSli1bppaWFmWzWWUyGVVUVBz2Cs8BAwZo//79B653ft3dNWrUKL3++utHfexRo0Ypm81q4sSJiec9+DG7znPiiSceuNy/f3/t27fvsPufe+65ymazevHFF9XQ0KCrrrpKDzzwQOLHT4LTAQEU1cCBAzVhwgTV1dUd8qRka2urTj/9dGUyGa1du7bbo9ezzjpLW7ZsUVtbm1pbW/XKK69Iks477zy1tLQcCHd7e7s2b9582P0bGho0d+5cffbZZ5KktrY2Pf744xo8eLBOPfVUrV+/XpK0dOnSA0ffFRUVymazkqSVK1cm+jNmMhm1t7dLkrZv366TTz5ZN910k+bMmaPGxsZE+zgWvOQd6EPSOm2ztrZWM2bMOOQMk1mzZunaa69VTU2NqqqqdP755x92v+HDh2vmzJmqrKzUyJEjNWbMGEnSCSecoJUrV+qOO+5Qa2ur9u3bp7vuukujRo065P7Tpk3Tjh07NHnyZLm7zEx1dXWSpKefflq33nqrvvnmG51zzjl66qmnJElz5szRzJkztXTp0sRH6vX19aqsrFR1dbVmz56te++9V/369VMmk9GTTz75nf7OjsaSPhlwLGpqarzUf5EC53EXFudxl6atW7fqggsuSHsMdNHd98XMsknfToSlEgAIhnADQDCEG+jlirEciu+uEN8Pwg30YmVlZdq1axfxLhGd78ddVlZ2XPvhrBKgFxs2bJiam5uP+/2fUTidvwHneBBuoBfLZDLH9ZtWUJpYKgGAYAg3AARDuAEgGMINAMEQbgAIhnADQDCEGwCCIdwAEAzhBoBgCDcABEO4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgkkcbjPrb2ZvmdmqYg4EADi6YznivlPS1mINAgBIJlG4zWyYpOmS/r244wAA8kl6xL1I0lxJ+4s4CwAggbzhNrNrJH3u7tk829Wb2QYz29DS0lKwAQEAh0pyxH2ZpB+Y2TZJyyVNNLNfdt3I3Re7e42715SXlxd4TABAp7zhdvcGdx/m7hWSfizpf939pqJPBgDoFudxA0AwA45lY3dfJ2ldUSYBACTCETcABEO4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgiHcABAM4QaAYAg3AARDuAEgGMINAMEQbgAIhnADQDCEGwCCIdwAEAzhBoBgCDcABEO4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEkzfcZlZmZm+a2UYz22xmD/bEYACA7g1IsE2bpInuvtvMMpJeM7OX3P3/ijwbAKAbecPt7i5pd+5qJvfhxRwKAHBkida4zay/mTVJ+lzSr939jW62qTezDWa2oaWlpdBzAgByEoXb3b919ypJwySNM7OLutlmsbvXuHtNeXl5oecEAOQc01kl7v6lpHWSphZlGgBAXknOKik3s+/nLp8kabKkd4s9GACge0nOKjlD0tNm1l8dof8Pd19V3LEAAEeS5KyStyWN6YFZAAAJ8MpJAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgiHcABAM4QaAYAg3AARDuAEgGMINAMEQbgAIhnADQDCEGwCCIdwAEAzhBoBgCDcABEO4ASAYwg0AwRBuAAi
GcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDB5A23mQ03s7VmttXMNpvZnT0xGACgewMSbLNP0j3u3mhmgyRlzezX7r6lyLMBALqR94jb3T9198bc5a8kbZV0ZrEHAwB075jWuM2sQtIYSW8UYxgAQH5JlkokSWY2UNJ/SbrL3f/QzdfrJdVL0ogRIwo2INAXVcx7Ie0RepVtC6enPUJBJTriNrOMOqK9zN2f7W4bd1/s7jXuXlNeXl7IGQEAB0lyVolJ+rmkre7+0+KPBAA4miRH3JdJ+omkiWbWlPuYVuS5AABHkHeN291fk2Q9MAsAIAFeOQkAwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgiHcABAM4QaAYAg3AARDuAEgGMINAMEQbgAIhnADQDCEGwCCIdwAEAzhBoBgCDcABEO4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDB5A23mf3CzD43s009MRAA4OiSHHEvkTS1yHMAABLKG253f1XS73tgFgBAAgVb4zazejPbYGYbWlpaCrVbAEAXBQu3uy929xp3rykvLy/UbgEAXXBWCQAEQ7gBIJgkpwP+StLrks4zs2Yz+5vijwUAOJIB+TZw99qeGAQAkAxLJQAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgiHcABAM4QaAYAg3AARDuAEgGMINAMEQbgAIhnADQDCEGwCCIdwAEAzhBoBgCDcABEO4ASAYwg0AwRBuAAiGcANAMIQbAIIh3AAQDOEGgGAINwAEQ7gBIBjCDQDBEG4ACIZwA0AwhBsAgkkUbjObambvmdkHZjav2EMBAI4sb7jNrL+kf5V0taQLJdWa2YXFHgwA0L0kR9zjJH3g7h+6+x8lLZf0w+KOBQA4kiThPlPSxwddb87dBgBIwYAE21g3t/lhG5nVS6rPXd1tZu8dz2A44DRJO9MeIh97JO0JkBJ+PgvnrKQbJgl3s6ThB10fJml7143cfbGkxUkfGMmY2QZ3r0l7DqA7/HymI8lSyW8kjTSzs83sBEk/lvTfxR0LAHAkeY+43X2fmd0m6X8k9Zf0C3ffXPTJAADdSrJUInd/UdKLRZ4F3WP5CaWMn88UmPthzzMCAEoYL3kHgGAINwAEQ7gBJGJm/czsL9KeA6xxlxwzK5d0i6QKHfTksbvXpTUT0MnMXnf3P097jr4u0Vkl6FHPS1ov6WVJ36Y8C9DVGjP7a0nPOkd9qeGIu8SYWZO7V6U9B9AdM/tK0vfUcVCxRx1vieHufkqqg/UxrHGXnlVmNi3tIYDuuPsgd+/n7hl3PyV3nWj3MI64S8xBRzR/lNSeu5kjGpQEMzNJsySd7e7/ZGbDJZ3h7m+mPFqfQrgBJGZmT0raL2miu19gZqdKWuPul6Q8Wp/Ck5MlyMx+IOmK3NV17r4qzXmAg/yZu1eb2VuS5O5f5N58Dj2INe4SY2YLJd0paUvu487cbUApaM/9OkOXDpy+uj/dkfoelkpKjJm9LanK3ffnrveX9Ja7V6Y7GSCZ2SxJN0qqlvS0pBsk/YO7/2eqg/UxLJWUpu9L+n3u8uA0BwEO5u7LzCwraZI6TgW8zt23pjxWn0O4S8/Dkt4ys7Xq+IdxhaSGdEcCDvFbSX9Qrh9mNsLdf5fuSH0LSyUlyMzOkHSJOsL9hrt/lvJIgCTJzG6X9I+SdqjjRTidL8BhKa8HEe4SY2aXSWpy96/N7CZ1rCX+zN0/Snk0QGb2gTrOLNmV9ix9GWeVlJ4nJX1jZhdLulfSR5KeSXck4ICPJbWmPURfxxp36dnn7m5mP5T0uLv/3MxuTnso9G1mdnfu4oeS1pnZC5LaOr/u7j9NZbA+inCXnq/MrEHSTyT9Ze50wEzKMwGDcp9/l/s4Ifch5c7pRs9hjbvEmNmfSqqV9Bt3fy33XhAT3H1pyqM
BMrMfdT1nu7vbUFyEu0Tk3lyq85thuc+eu9wm6QNJ97v7KymMB0iSzKzR3avz3YbiYqmkRLj7oCN9LbdccpGkZbnPQI8ys6slTZN0ppk9ftCXTpG0L52p+i7CHYC7fytpo5n9S9qzoM/aLmmDpB9Jel8d/xv8Vh3nc/99inP1SSyVAMjLzDKS/lnS30rapo4lvOGSnpI0393bj3xvFBrncQNI4lFJp0o6y92r3X2MpHPU8V46j6U6WR/EETeAvMzst5LO7foLgnPPv7zr7iPTmaxv4ogbQBLe3W91zz3/wtFfDyPcAJLYYmazu96Yez+dd1OYp09jqQRAXmZ2pqRnJe2RlFXHUfYlkk6SdL27f5LieH0O4QaQmJlNlDRKHWeVbOYFYekg3AAQDGvcABAM4QaAYAg3AARDuAEgGMINAMH8P/RhZ+yV/TDFAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p2 = plotUniqueValuesComparation(df_a_json, df_a_other, 'symbol')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
window.localStorage0.822601NaN
window.sessionStorage0.171297NaN
HTMLCanvasElement.ownerDocument0.004006NaN
window.name0.0020760.000182
HTMLCanvasElement.style0.000021NaN
window.document.cookieNaN0.999818
\n", + "
" + ], + "text/plain": [ + " json other\n", + "window.localStorage 0.822601 NaN\n", + "window.sessionStorage 0.171297 NaN\n", + "HTMLCanvasElement.ownerDocument 0.004006 NaN\n", + "window.name 0.002076 0.000182\n", + "HTMLCanvasElement.style 0.000021 NaN\n", + "window.document.cookie NaN 0.999818" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotUsageComparation(df_a_json, df_a_other, 'symbol')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Value CountsValue Counts
Json125
Other2452
\n", + "
" + ], + "text/plain": [ + " Value Counts Value Counts\n", + "Json 12 5\n", + "Other 245 2" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEOCAYAAACHE9xHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFCRJREFUeJzt3X+QnVWd5/H3F2hoHTBiCCyQSEcrKMSEJrQsuzhDhuASgw4/RpAUmtTENWMVqOwSMYlTJTqKaDEOy9ZKFVMqgcpMwmSkoBCZSCoUWOWvdGwhIaIpJ0CTGJqILQhkk/CdP/ppbJJO/7653affr6pb97nnnud5vp3u/vSTc889NzITSVK5Dqt3AZKk2jLoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYU7ot4FABx33HHZ1NRU7zIkaUxpbW19PjMn9ddvVAR9U1MTGzZsqHcZkjSmRMRTA+nn0I0kFa7foI+IKRGxPiK2RMTmiPhM1X5DRDwbEW3VbV6PfZZFxNaIeDIiLqzlFyBJ6ttAhm72Atdl5saIOAZojYgfVM/9Y2be3LNzRJwOXAlMB04CHoqIUzNz30gWLkkamH6DPjN3ADuq7RcjYgtwch+7XAysyszdwH9ExFbgbOBHgylsz549tLe38+qrrw5mN9VQY2MjkydPpqGhod6lSBqEQb0YGxFNwJnAT4BzgWsiYgGwga6r/hfo+iPw4x67tdPLH4aIWAwsBnj7299+wLna29s55phjaGpqIiIGU6ZqIDPZtWsX7e3tTJ06td7lSBqEAb8YGxFHA/8GXJuZfwBuA94JNNN1xf8P3V172f2ATzfJzNszsyUzWyZNOnB20KuvvsrEiRMN+VEiIpg4caL/w5LGoAEFfUQ00BXyKzPzuwCZuTMz92Xma8A/0TU8A11X8FN67D4Z2D6U4gz50cXvhzQ2DWTWTQDfArZk5jd6tJ/Yo9ulwKZq+z7gyog4KiKmAtOAn45cyZKkwRjIGP25wMeAxyOirWpbDsyPiGa6hmW2AX8LkJmbI+Ju4Am6ZuxcPRIzbpqWfm+4h3iDbTdd1Ofzs2fPZtmyZVx44Z9mh95yyy386le/4pvf/OZB9zv66KN56aWXRqTGO++8k69//etkJpnJokWLWLJkyYgcu9uNN97I8uXLR/SYOvRG+vejVvr7vVNt9HtFn5k/zMzIzJmZ2VzdHsjMj2XmjKr9r6rZOd37fCUz35mZ78rM79f2S6iN+fPns2rVqje0rVq1ivnz5x+S83//+9/nlltuYe3atWzevJmNGzcyYcKEET/PjTfeOOLHlDS6+M7Yg/jwhz/M/fffz+7duwHYtm0b27dv533vex8vvfQSc+bMYdasWcyYMYN77733gP0ffvhhPvjBD77++JprruGOO+4AoLW1lfPOO4+zzjqLCy+8kB07dhyw/1e/+lVuvvlmTjrpJKBrauMnPvEJANra2jjnnHOYOXMml156KS+88ALQ9b+Q7qUknn/+ebrXD7rjjju47LLLmDt3LtOmTeP6668HYOnSpbzyyis0Nzdz1VVX8cc//pGLLrqIM844g/e85z2sXr16BP4lJdWbQX8QEydO5Oyzz+bBBx8Euq7mP/KRjxARNDY2cs8997Bx40bWr1/PddddR+YBE4t6tWfPHj71qU+xZs0aWltbWbRoEZ///OcP6Ldp0ybOOuusXo+xYMECvva1r/HYY48xY8YMvvjFL/Z73ra2NlavXs3jjz/O6tWre
eaZZ7jpppt405veRFtbGytXruTBBx/kpJNO4he/+AWbNm1i7ty5A/qaJI1uBn0feg7f9By2yUyWL1/OzJkzueCCC3j22WfZuXPngI755JNPsmnTJt7//vfT3NzMl7/8Zdrb2wdcU2dnJ7///e8577zzAFi4cCGPPPJIv/vNmTOHCRMm0NjYyOmnn85TTx24FtKMGTN46KGH+NznPsejjz5ak6EiSYeeQd+HSy65hHXr1rFx40ZeeeUVZs2aBcDKlSvp6OigtbWVtrY2TjjhhAPmlx9xxBG89tprrz/ufj4zmT59Om1tbbS1tfH444+zdu3aA849ffp0WltbB1Vvz3PuX89RRx31+vbhhx/O3r17D9j/1FNPpbW1lRkzZrBs2TK+9KUvDer8kkYng74PRx99NLNnz2bRokVveBG2s7OT448/noaGBtavX9/r1fEpp5zCE088we7du+ns7GTdunUAvOtd76Kjo4Mf/ahrRYg9e/awefPmA/ZftmwZ119/Pb/97W8B2L17N7feeisTJkzg2GOP5dFHHwXgrrvuev3qvqmp6fU/DmvWrBnQ19jQ0MCePXsA2L59O29+85v56Ec/ypIlS9i4ceOAjiFpdBsV69EPRL2mZc2fP5/LLrvsDTNwrrrqKj70oQ/R0tJCc3Mz7373uw/Yb8qUKVxxxRXMnDmTadOmceaZZwJw5JFHsmbNGj796U/T2dnJ3r17ufbaa5k+ffob9p83bx47d+7kggsuIDOJCBYtWgTAihUr+OQnP8nLL7/MO97xDr7zne8AsGTJEq644gruuusuzj///AF9fYsXL2bmzJnMmjWLBQsW8NnPfpbDDjuMhoYGbrvttiH9m0kaXWKgLyLWUktLS+7/wSNbtmzhtNNOq1NFOhi/L6OT8+jHp4hozcyW/vo5dCNJhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKN2bm0XPDCL8d/4bOPp92mWJJpfCK/iBcplhSKQz6g3CZYpcplkph0B+EyxS7TLFUCoO+Dy5T7DLFUgkM+j64TLHLFEslMOj74DLFLlMslWAMTa/sezpkrbhMsaSxzmWKNSh+X0Ynlyken1ymWJIEGPSSVLxRHfSjYVhJf+L3QxqbRm3QNzY2smvXLsNllMhMdu3aRWNjY71LkTRIo3bWzeTJk2lvb6ejo6PepajS2NjI5MmT612GpEEatUHf0NDA1KlT612GJI15o3boRpI0Mgx6SSqcQS9JhTPoJalwBr0kFa7foI+IKRGxPiK2RMTmiPhM1f62iPhBRPy6uj+2ao+IuDUitkbEYxExq9ZfhCTp4AZyRb8XuC4zTwPOAa6OiNOBpcC6zJwGrKseA3wAmFbdFgMugShJddRv0GfmjszcWG2/CGwBTgYuBlZU3VYAl1TbFwN3ZpcfA2+NiBNHvHJJ0oAMaow+IpqAM4GfACdk5g7o+mMAHF91Oxl4psdu7VWbJKkOBhz0EXE08G/AtZn5h7669tJ2wII1EbE4IjZExAaXOZCk2hlQ0EdEA10hvzIzv1s17+wekqnun6va24EpPXafDGzf/5iZeXtmtmRmy6RJk4ZavySpHwOZdRPAt4AtmfmNHk/dByysthcC9/ZoX1DNvjkH6Owe4pEkHXoDWdTsXOBjwOMR0Va1LQduAu6OiI8DTwOXV889AMwDtgIvA38zohVLkgal36DPzB/S+7g7wJxe+idw9TDrkiSNEN8ZK0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9Jhes36CPi2xHxXERs6tF2Q0Q8GxFt1W1ej+eWRcTWiHgyIi6sVeGSpIEZyBX9HcDcXtr/MTObq
9sDABFxOnAlML3a55sRcfhIFStJGrx+gz4zHwF+N8DjXQysyszdmfkfwFbg7GHUJ0kapuGM0V8TEY9VQzvHVm0nA8/06NNetUmS6mSoQX8b8E6gGdgB/EPVHr30zd4OEBGLI2JDRGzo6OgYYhmSpP4MKegzc2dm7svM14B/4k/DM+3AlB5dJwPbD3KM2zOzJTNbJk2aNJQyJEkDMKSgj4gTezy8FOiekXMfcGVEHBURU4FpwE+HV6IkaTiO6K9DRPwLMBs4LiLagS8AsyOima5hmW3A3wJk5uaIuBt4AtgLXJ2Z+2pTuiRpIPoN+syc30vzt/ro/xXgK8MpSpI0cnxnrCQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TC9Rv0EfHtiHguIjb1aHtbRPwgIn5d3R9btUdE3BoRWyPisYiYVcviJUn9G8gV/R3A3P3algLrMnMasK56DPABYFp1WwzcNjJlSpKGqt+gz8xHgN/t13wxsKLaXgFc0qP9zuzyY+CtEXHiSBUrSRq8oY7Rn5CZOwCq++Or9pOBZ3r0a6/aDhARiyNiQ0Rs6OjoGGIZkqT+jPSLsdFLW/bWMTNvz8yWzGyZNGnSCJchSeo21KDf2T0kU90/V7W3A1N69JsMbB96eZKk4Rpq0N8HLKy2FwL39mhfUM2+OQfo7B7ikSTVxxH9dYiIfwFmA8dFRDvwBeAm4O6I+DjwNHB51f0BYB6wFXgZ+Jsa1CxJGoR+gz4z5x/kqTm99E3g6uEWJUkaOb4zVpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCnfEcHaOiG3Ai8A+YG9mtkTE24DVQBOwDbgiM18YXpmSpKEaiSv6v8zM5sxsqR4vBdZl5jRgXfVYklQntRi6uRhYUW2vAC6pwTkkSQM03KBPYG1EtEbE4qrthMzcAVDdH9/bjhGxOCI2RMSGjo6OYZYhSTqYYY3RA+dm5vaIOB74QUT8cqA7ZubtwO0ALS0tOcw6JEkHMawr+szcXt0/B9wDnA3sjIgTAar754ZbpCRp6IYc9BHxZxFxTPc28D+ATcB9wMKq20Lg3uEWKUkauuEM3ZwA3BMR3cf558x8MCJ+BtwdER8HngYuH36ZkqShGnLQZ+ZvgDN6ad8FzBlOUZKkkeM7YyWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXuiHoXMJY0Lf1evUsYkG03XVTvEiSNIl7RS1LhDHpJKpxBL0mFM+glqXAGvSQVzlk3JbphQr0rGJgbOutdgTQu1OyKPiLmRsSTEbE1IpbW6jySpL7VJOgj4nDg/wEfAE4H5kfE6bU4lySpb7Uaujkb2JqZvwGIiFXAxcATNTqfpLHAYcW6qNXQzcnAMz0et1dtkqRDrFZX9NFLW76hQ8RiYHH18KWIeLJGtYw7AccBz9e7jn59sbcfE5XMn80Rd8pAOtUq6NuBKT0eTwa29+yQmbcDt9fo/ONaRGzIzJZ61yHtz5/N+qjV0M3PgGkRMTUijgSuBO6r0bkkSX2oyRV9Zu6NiGuAf
wcOB76dmZtrcS5JUt9q9oapzHwAeKBWx1efHBLTaOXPZh1EZvbfS5I0ZrnWjSQVzqCXpMIZ9JJqJiIOi4j/Xu86xjvH6Me4iJgEfAJooseL65m5qF41ST1FxI8y87/Vu47xzGWKx757gUeBh4B9da5F6s3aiPhr4LvplWVdeEU/xkVEW2Y217sO6WAi4kXgz+i6EHmFriVSMjPfUtfCxhHH6Me++yNiXr2LkA4mM4/JzMMysyEz31I9NuQPIa/ox7geV0v/H9hTNXu1pFEjIgK4CpiamX8fEVOAEzPzp3Uubdww6CXVVETcBrwGnJ+Zp0XEscDazHxvnUsbN3wxtgAR8VfAX1QPH87M++tZj7Sf/5qZsyLi5wCZ+UK12KEOEcfox7iIuAn4DF2f3vUE8JmqTRot9lQfL5rw+pTg1+pb0vji0M0YFxGPAc2Z+Vr1+HDg55k5s76VSV0i4irgI8AsYAXwYeDvMvNf61rYOOLQTRneCvyu2h4jH8qp8SIzV0ZEKzCHrqmVl2TmljqXNa4Y9GPfV4GfR8R6un6J/gJYVt+SpAP8GvgDVeZExNsz8+n6ljR+OHRTgIg4EXgvXUH/k8z8bZ1Lkl4XEZ8CvgDspOtNU91vmHJ48RAx6Me4iDgXaMvMP0bER+kaB/0/mflUnUuTAIiIrXTNvNlV71rGK2fdjH23AS9HxBnAZ4GngDvrW5L0Bs8AnfUuYjxzjH7s25uZGREXA7dm5rciYmG9i5Ii4n9Xm78BHo6I7wG7u5/PzG/UpbBxyKAf+16MiGXAx4A/r6ZXNtS5JgngmOr+6ep2ZHWDak69Dg3H6Me4iPgvwHzgZ5n5w2odkdmZeVedS5MAiIjL958z31ubasegH6Oqxcy6v3lR3We1vRvYCnw+M9fVoTzpdRGxMTNn9dem2nHoZozKzGMO9lw1fPMeYGV1Lx1yEfEBYB5wckTc2uOptwB761PV+GTQFygz9wG/iIj/W+9aNK5tBzYAlwO/out/nPvomk//v+pY17jj0I2kmoiIBuArwP8EttE1rDgF+A6wPDP3HHxvjSTn0Uuqla8DxwKnZOaszDwTeAdd6zHdXNfKxhmv6CXVRET8Gjh1/w8Er15D+mVmTqtPZeOPV/SSaiX3D/mqcR/Ooz+kDHpJtfJERCzYv7Fak+mXdahn3HLoRlJNRMTJwHeBV4BWuq7i3wu8Cbg0M5+tY3njikEvqaYi4nxgOl2zbjb7Jr5Dz6CXpMI5Ri9JhTPoJalwBr0kFc6gl6TCGfSSVLj/BI9aQOe2deiMAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p3 = pd.concat([p1, p2], axis=1, sort=False).drop_duplicates()\n", + "p3.plot(kind='bar')\n", + "p3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DOMAINS\n", + "\n", + "Again, the difference for unique values for the whole sample and the filtered one is really big. Only 7.2%[1] of the values remain on the filtered sample for the non-JSON values and 30% for the JSON’s. \n", + "\n", + "---\n", + " For futher investigation: \n", + "1. Only few of domains produce bigger values, why? Do they have something in common? Does that mean that some domains only produce bigger values? \n", + " \n", + "2. What are the top domains commonly used for?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Full Sample" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 11166 unique script_domain present on the non-json dataset and 3507 on the JSONs\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYcAAAEOCAYAAABiodtuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFdNJREFUeJzt3X+MV/Wd7/HnWxhFqyJF9CpwHczSVhFEnFp321uNWEFtF+uqldiFFLekN9a2e2ut2E1MrVptzNb15mpi1h9oyILLttFY649STN3EWhmcKkgtxKqMKI6CbP1FQd/3jzm4I59BcL4jZ+D7fCST7znv8zln3sOPec35nHO+E5mJJEk97VF3A5KkgcdwkCQVDAdJUsFwkCQVDAdJUsFwkCQVDAdJUsFwkCQVDAdJUmFw3Q301YEHHpitra11tyFJu4z29vZXMnPEjozdZcOhtbWVJUuW1N2GJO0yIuK5HR3rtJIkqWA4SJIKhoMkqbDLXnPozaZNm+js7OTtt9+uuxVVhgwZwqhRo2hpaam7FUkfwm4VDp2dney33360trYSEXW30/Qyk1dffZXOzk7GjBlTdzuSPoTdalrp7bffZvjw4QbDABERDB8+3DM5aRe0W4UDYDAMMP59SLum3S4cJEmN262uOWyt9ZJf9Ovxnr369A/cfuKJJzJnzhymTJnyXu26667jj3/8IzfccMM299t33315/fXX+6XH22+/nZ/85CdkJpnJrFmzuOiii/rl2FtcddVVXHrppf16TO18/f3/o9lt7/vDrsYzh340ffp05s+f/77a/PnzmT59+k75/L/85S+57rrreOCBB1i+fDlLly5l6NCh/f55rrrqqn4/pqSBxXDoR2eddRb33HMPGzduBODZZ59lzZo1fO5zn+P1119n8uTJTJo0ifHjx3PXXXcV+z/00EN88YtffG/9m9/8JrfddhsA7e3tnHDCCRx77LFMmTKFF198sdj/xz/+Mddeey2HHnoo0H0b6de//nUAOjo6OP7445kwYQJf/vKXWb9+PdB9trPlbUheeeUVtrxf1W233caZZ57J1KlTGTt2LBdffDEAl1xyCW+99RYTJ07kvPPO44033uD000/n6KOP5qijjmLBggX98CcpqW6GQz8aPnw4xx13HPfddx/Qfdbwla98hYhgyJAh/PznP2fp0qUsXryY7373u2TmDh1306ZNXHjhhSxcuJD29nZmzZrFD37wg2LcsmXLOPbYY3s9xowZM7jmmmt44oknGD9+PD/84Q+3+3k7OjpYsGABTz75JAsWLGD16tVcffXV7L333nR0dDBv3jzuu+8+Dj30UH7/+9+zbNkypk6dukNfk6SBzXDoZz2nlnpOKWUml156KRMmTODkk0/mhRdeYO3atTt0zKeffpply5bxhS98gYkTJ3LFFVfQ2dm5wz1t2LCB1157jRNOOAGAmTNn8pvf/Ga7+02ePJmhQ4cyZMgQjjzySJ57rnzPrvHjx/OrX/2K73//+zz88MMfyTSWpJ3PcOhnZ5xxBosWLWLp0qW89dZbTJo0CYB58+bR1dVFe3s7HR0dHHzwwcX9/4MHD+bdd999b33L9sxk3LhxdHR00NHRwZNPPskDDzxQfO5x48bR3t7+ofrt+Tm37mevvfZ6b3nQoEFs3ry52P8Tn/gE7e3tjB8/njlz5nD55Zd/qM8vaWAyHPrZvvvuy4knnsisWbPedyF6w4YNHHTQQbS0tLB48eJefwo/7LDDeOqpp9i4cSMbNmxg0aJFAHzyk5+kq6uLRx55BOieZlq+fHmx/5w5c7j44ot56aWXANi4cSPXX389Q4cOZdiwYTz88MMA3HHHHe+dRbS2tr4XKAsXLtyhr7GlpYVNmzYBsGbNGvbZZx+++tWvctFFF7F06dIdOoakgW23vpW1rlvLpk+fzplnnvm+O5fOO+88vvSlL9HW1sbEiRP51Kc+Vew3evRozjnnHCZ
MmMDYsWM55phjANhzzz1ZuHAh3/rWt9iwYQObN2/mO9/5DuPGjXvf/qeddhpr167l5JNPJjOJCGbNmgXA3Llz+cY3vsGbb77J4Ycfzq233grARRddxDnnnMMdd9zBSSedtENf3+zZs5kwYQKTJk1ixowZfO9732OPPfagpaWFG2+8sU9/ZpIGltjRi6IDTVtbW279y35WrFjBEUccUVNH2hb/XgYmn3PoX7vCcw4R0Z6ZbTsy1mklSVLBcJAkFXa7cNhVp8l2V/59SLum3SochgwZwquvvuo3pAFiy+9zGDJkSN2tSPqQdqu7lUaNGkVnZyddXV11t6LKlt8EJ2nXsluFQ0tLi79xTJL6wW41rSRJ6h+GgySpYDhIkgqGgySpsN1wiIhbIuLliFjWo/bxiHgwIlZWr8OqekTE9RGxKiKeiIhJPfaZWY1fGREze9SPjYgnq32uD38jvSTVbkfOHG4Dtv4NLpcAizJzLLCoWgc4FRhbfcwGboTuMAEuAz4DHAdctiVQqjGze+znb4uRpJptNxwy8zfAuq3K04C51fJc4Iwe9duz22+BAyLiEGAK8GBmrsvM9cCDwNRq2/6Z+Uh2P7l2e49jSZJq0tdrDgdn5osA1etBVX0ksLrHuM6q9kH1zl7qvYqI2RGxJCKW+KCbJH10+vuCdG/XC7IP9V5l5k2Z2ZaZbSNGjOhji5Kk7elrOKytpoSoXl+u6p3A6B7jRgFrtlMf1UtdklSjvobD3cCWO45mAnf1qM+o7lo6HthQTTvdD5wSEcOqC9GnAPdX2/4cEcdXdynN6HEsSVJNtvveShHxb8CJwIER0Un3XUdXA3dGxPnA88DZ1fB7gdOAVcCbwNcAMnNdRPwIeKwad3lmbrnI/b/pviNqb+CX1YckqUbbDYfMnL6NTZN7GZvABds4zi3ALb3UlwBHba8PSdLO4xPSkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKjQUDhHxjxGxPCKWRcS/RcSQiBgTEY9GxMqIWBARe1Zj96rWV1XbW3scZ05VfzoipjT2JUmSGtXncIiIkcC3gLbMPAoYBJwLXAP8NDPHAuuB86tdzgfWZ+ZfAT+txhERR1b7jQOmAjdExKC+9iVJalyj00qDgb0jYjCwD/AicBKwsNo+FzijWp5WrVNtnxwRUdXnZ+bGzPwTsAo4rsG+JEkN6HM4ZOYLwLXA83SHwgagHXgtMzdXwzqBkdXySGB1te/mavzwnvVe9nmfiJgdEUsiYklXV1dfW5ckbUcj00rD6P6pfwxwKPAx4NRehuaWXbaxbVv1sph5U2a2ZWbbiBEjPnzTkqQd0si00snAnzKzKzM3AT8D/gY4oJpmAhgFrKmWO4HRANX2ocC6nvVe9pEk1aCRcHgeOD4i9qmuHUwGngIWA2dVY2YCd1XLd1frVNt/nZlZ1c+t7mYaA4wFftdAX5KkBg3e/pDeZeajEbEQWApsBh4HbgJ+AcyPiCuq2s3VLjcDd0TEKrrPGM6tjrM8Iu6kO1g2Axdk5jt97UuS1Lg+hwNAZl4GXLZV+Rl6udsoM98Gzt7Gca4ErmykF0lS//EJaUlSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUMB0lSwXCQJBUaCoeIOCAiFkbEHyJiRUT8dUR8PCIejIiV1euwamxExPURsSoinoiIST2OM7MavzIiZjb6RUmSGtPomcO/APdl5qeAo4EVwCXAoswcCyyq1gFOBcZWH7O
BGwEi4uPAZcBngOOAy7YEiiSpHn0Oh4jYH/g8cDNAZv4lM18DpgFzq2FzgTOq5WnA7dntt8ABEXEIMAV4MDPXZeZ64EFgal/7kiQ1rpEzh8OBLuDWiHg8Iv41Ij4GHJyZLwJUrwdV40cCq3vs31nVtlUvRMTsiFgSEUu6uroaaF2S9EEaCYfBwCTgxsw8BniD/55C6k30UssPqJfFzJsysy0z20aMGPFh+5Uk7aBGwqET6MzMR6v1hXSHxdpquojq9eUe40f32H8UsOYD6pKkmvQ5HDLzJWB1RHyyKk0GngLuBrbccTQTuKtavhuYUd21dDywoZp2uh84JSKGVReiT6lqkqSaDG5w/wuBeRGxJ/AM8DW6A+fOiDgfeB44uxp7L3AasAp4sxpLZq6LiB8Bj1XjLs/MdQ32JUlqQEPhkJkdQFsvmyb3MjaBC7ZxnFuAWxrpRZLUf3xCWpJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSQXDQZJUMBwkSYWGwyEiBkXE4xFxT7U+JiIejYiVEbEgIvas6ntV66uq7a09jjGnqj8dEVMa7UmS1Jj+OHP4NrCix/o1wE8zcyywHji/qp8PrM/MvwJ+Wo0jIo4EzgXGAVOBGyJiUD/0JUnqo4bCISJGAacD/1qtB3ASsLAaMhc4o1qeVq1TbZ9cjZ8GzM/MjZn5J2AVcFwjfUmSGtPomcN1wMXAu9X6cOC1zNxcrXcCI6vlkcBqgGr7hmr8e/Ve9nmfiJgdEUsiYklXV1eDrUuStqXP4RARXwRezsz2nuVehuZ2tn3QPu8vZt6UmW2Z2TZixIgP1a8kaccNbmDfzwJ/GxGnAUOA/ek+kzggIgZXZwejgDXV+E5gNNAZEYOBocC6HvUteu4jSapBn88cMnNOZo7KzFa6Lyj/OjPPAxYDZ1XDZgJ3Vct3V+tU23+dmVnVz63uZhoDjAV+19e+JEmNa+TMYVu+D8yPiCuAx4Gbq/rNwB0RsYruM4ZzATJzeUTcCTwFbAYuyMx3PoK+JEk7qF/CITMfAh6qlp+hl7uNMvNt4Oxt7H8lcGV/9CJJapxPSEuSCoaDJKlgOEiSCoaDJKnwUdytpErrJb+ou4XdyrNXn153C1LT8MxBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklQwHCRJBcNBklToczhExOiIWBwRKyJieUR8u6p/PCIejIiV1euwqh4RcX1ErIqIJyJiUo9jzazGr4yImY1/WZKkRjRy5rAZ+G5mHgEcD1wQEUcClwCLMnMssKhaBzgVGFt9zAZuhO4wAS4DPgMcB1y2JVAkSfXoczhk5ouZubRa/jOwAhgJTAPmVsPmAmdUy9OA27Pbb4EDIuIQYArwYGauy8z1wIPA1L72JUlqXL9cc4iIVuAY4FHg4Mx8EboDBDioGjYSWN1jt86qtq16b59ndkQsiYglXV1d/dG6JKkXDYdDROwL/Afwncz8rw8a2kstP6BeFjNvysy2zGwbMWLEh29WkrRDGgqHiGihOxjmZebPqvLaarqI6vXlqt4JjO6x+yhgzQfUJUk1aeRupQBuBlZk5j/32HQ3sOWOo5nAXT3qM6q7lo4HNlTTTvcDp0TEsOpC9ClVTZJUk8EN7PtZ4O+BJyOio6pdClwN3BkR5wPPA2dX2+4FTgNWAW8CXwPIzHUR8SPgsWrc5Zm5roG+JEkN6nM4ZOZ/0vv1AoDJvYxP4IJtHOsW4Ja+9iJJ6l8+IS1JKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SB
JKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqSC4SBJKhgOkqTCgAmHiJgaEU9HxKqIuKTufiSpmQ2IcIiIQcD/A04FjgSmR8SR9XYlSc1rQIQDcBywKjOfycy/APOBaTX3JElNa6CEw0hgdY/1zqomSarB4LobqEQvtSwGRcwGZlerr0fE0x9pV83jQOCVupvYnrim7g5UE/999p/DdnTgQAmHTmB0j/VRwJqtB2XmTcBNO6upZhERSzKzre4+pN7477MeA2Va6TFgbESMiYg9gXOBu2vuSZKa1oA4c8jMzRHxTeB+YBBwS2Yur7ktSWpaAyIcADLzXuDeuvtoUk7VaSDz32cNIrO47itJanID5ZqDJGkAMRwkSQXDQdKAERF7RMTf1N2HvObQlCJiBPB1oJUeNyVk5qy6epK2iIhHMvOv6+6j2Q2Yu5W0U90FPAz8Cnin5l6krT0QEX8H/Cz96bU2njk0oYjoyMyJdfch9SYi/gx8jO4fXN6i++11MjP3r7WxJuM1h+Z0T0ScVncTUm8yc7/M3CMzWzJz/2rdYNjJPHNoQj1+MvsLsKkq+5OZBoSICOA8YExm/igiRgOHZObvam6tqRgOkgaUiLgReBc4KTOPiIhhwAOZ+emaW2sqXpBuUhHxt8Dnq9WHMvOeOvuRevhMZk6KiMcBMnN99Yac2om85tCEIuJq4NvAU9XHt6uaNBBsqn51cMJ7t16/W29LzcdppSYUEU8AEzPz3Wp9EPB4Zk6otzMJIuI84CvAJGAucBbwT5n577U21mScVmpeBwDrquWhdTYi9ZSZ8yKiHZhM922sZ2TmiprbajqGQ3P6MfB4RCym+z/f54E59bYkvc9K4L+ovkdFxP/MzOfrbam5OK3UpCLiEODTdIfDo5n5Us0tSQBExIXAZcBauh+E2/IQnNOeO5Hh0IQi4rNAR2a+ERFfpXtu918y87maW5OIiFV037H0at29NDPvVmpONwJvRsTRwPeA54Db621Jes9qYEPdTTQ7rzk0p82ZmRExDbg+M2+OiJl1N6XmFhH/p1p8BngoIn4BbNyyPTP/uZbGmpTh0Jz+HBFzgL8H/ld1K2tLzT1J+1Wvz1cfe1YfUD3zoJ3Haw5NKCL+BzAdeCwz/7N675oTM/OOmluTiIizt36mobeaPlqGQxOp3nBvy194VK9ZLW8EVgE/yMxFNbQnARARSzNz0vZq+mg5rdREMnO/bW2rppaOAuZVr9JOFRGnAqcBIyPi+h6b9gc219NV8zIcBEBmvgP8PiL+b929qGmtAZYAZwN/pPus9h26n3f4xxr7akpOK0kaECKiBbgS+AfgWbqnO0cDtwKXZuambe+t/uZzDpIGip8Aw4DDMnNSZh4DHE73e39dW2tnTcgzB0kDQkSsBD6RW31Tqq6H/SEzx9bTWXPyzEHSQJFbB0NVfAefc9jpDAdJA8VTETFj62L1/l9/qKGfpua0kqQBISJGAj8D3gLa6T5b+DSwN/DlzHyhxvaajuEgaUCJiJOAcXTfrbTchzLrYThIkgpec5AkFQwHSVLBcJAkFQwHSVLBcJAkFf4/Ts5bWohEuncAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotUniqueValuesComparation(df_json, df_other, 'script_domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
baidu.com0.1561620.029125
cloudfront.net0.0823070.031655
rambler.ru0.0482660.010561
google-analytics.com0.0000050.121869
yandex.ru0.0219040.030423
\n", + "
" + ], + "text/plain": [ + " json other\n", + "baidu.com 0.156162 0.029125\n", + "cloudfront.net 0.082307 0.031655\n", + "rambler.ru 0.048266 0.010561\n", + "google-analytics.com 0.000005 0.121869\n", + "yandex.ru 0.021904 0.030423" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotTopUsageComparation(df_json, df_other, 'script_domain', 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Above the mean Sample" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 811 unique script_domain present on the non-json dataset and 1051 on the JSONs\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEOCAYAAACAfcAXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFHtJREFUeJzt3X+QVeWd5/H3V2lFo0GC6CqwttaQRBFE7BhnkomWmFXRDMT1RygysMENlapokjFEwUxVKpmsQcuaGKcmbFljFC1qIMsmhWWMY0Kw4lQlbmjsKEiMlKvSotgq9kRFAvE7f9yDaaGh277tvdDP+1XVdc95znPO+V66uZ97nnPOvZGZSJLKc1CzC5AkNYcBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSrUsGYXsC9HH310tra2NrsMSTqgtLe3v5SZo/vqt18HQGtrK2vWrGl2GZJ0QImIZ/rTzyEgSSqUASBJhTIAJKlQ+/U5AEn7jx07dtDZ2cmbb77Z7FJUGT58OGPHjqWlpWVA6xsAkvqls7OTI488ktbWViKi2eUULzN5+eWX6ezs5MQTTxzQNhwCktQvb775JqNGjfLFfz8REYwaNaquIzIDQFK/+eK/f6n392EASFKhPAcwCFoX/KTZJQwpTy+6qNklqB8G++++r9/7Oeecw8KFCzn//PPfbrvlllv4/e9/z/e///29rnfEEUfw2muvDUqNd911FzfddBOZSWYyd+5c5s+fPyjb3uWGG27g+uuvH9Rt7o1HAJIOCDNnzmTZsmXvaFu2bBkzZ85syP5/+tOfcsstt/DAAw+wfv161q5dy4gRIwZ9PzfccMOgb3NvDABJB4RLL72Ue++9l+3btwPw9NNPs3nzZj7+8Y/z2muvMXXqVKZMmcLEiRNZuXLlHus/+OCDXHzxxW/PX3XVVdx5550AtLe3c/bZZ3PGGWdw/vnn8/zzz++x/ne+8x1uvvlmjj/+eKB2CebnP/95ADo6OjjrrLOYNGkSn/70p9m6dStQO2rZ9XE2L730Ers+2+zOO+/kkksu4YILLmD8+PFce+21ACxYsIBt27YxefJkZs2axeuvv85FF13Eaaedxqmnnsry5csH4V/yzwwASQeEUaNGceaZZ3L//fcDtXf/V1xxBRHB8OHD+fGPf8zatWtZvXo1X/3qV8nMfm13x44dXH311axYsYL29nbmzp3L17/+9T36rVu3jjPOOKPXbcyePZsbb7yRRx99lIkTJ/LNb36zz/12dHSwfPlyHnvsMZYvX86mTZtYtGgRhx12GB0dHSxdupT777+f448/nt/+9resW7eOCy64oF/Pqb8MAEkHjJ7DQD2HfzKT66+/nkmTJnH
eeefx3HPPsWXLln5t84knnmDdunV88pOfZPLkyXz729+ms7Oz3zV1d3fz6quvcvbZZwMwZ84cfvnLX/a53tSpUxkxYgTDhw/nlFNO4Zln9vz8tokTJ/Lzn/+c6667joceemjQh5wMAEkHjBkzZrBq1SrWrl3Ltm3bmDJlCgBLly6lq6uL9vZ2Ojo6OPbYY/e4Pn7YsGG89dZbb8/vWp6ZTJgwgY6ODjo6Onjsscd44IEH9tj3hAkTaG9vf1f19tzn7vUceuihb08ffPDB7Ny5c4/1P/jBD9Le3s7EiRNZuHAh3/rWt97V/vtiAEg6YBxxxBGcc845zJ079x0nf7u7uznmmGNoaWlh9erVvb6bPuGEE3j88cfZvn073d3drFq1CoAPfehDdHV18atf/QqoDQmtX79+j/UXLlzItddeywsvvADA9u3bufXWWxkxYgQjR47koYceAuDuu+9++2igtbX17dBYsWJFv55jS0sLO3bsAGDz5s0cfvjhfPazn2X+/PmsXbu2X9vorz4vA42IHwAXAy9m5qlV2weA5UAr8DRweWZujdpdCd8DpgFvAP8jM9dW68wB/r7a7Lczc8mgPhNJDdWsy3VnzpzJJZdc8o4rgmbNmsWnPvUp2tramDx5Mh/+8If3WG/cuHFcfvnlTJo0ifHjx3P66acDcMghh7BixQq+9KUv0d3dzc6dO/nKV77ChAkT3rH+tGnT2LJlC+eddx6ZSUQwd+5cAJYsWcIXvvAF3njjDU466STuuOMOAObPn8/ll1/O3Xffzbnnntuv5zdv3jwmTZrElClTmD17Nl/72tc46KCDaGlpYfHixQP6N9ub6OtESUR8AngNuKtHANwEvJKZiyJiATAyM6+LiGnA1dQC4KPA9zLzo1VgrAHagATagTMyc+u+9t3W1pYHwhfCeB/A4PI+gP3Thg0bOPnkk5tdhnbT2+8lItozs62vdfscAsrMXwKv7NY8Hdj1Dn4JMKNH+11Z82vgqIg4Djgf+FlmvlK96P8MGNzT2ZKkd2Wg5wCOzcznAarHY6r2McCmHv06q7a9tUuSmmSwTwL39slEuY/2PTcQMS8i1kTEmq6urkEtTlJ9+nttvRqj3t/HQANgSzW0Q/X4YtXeCYzr0W8ssHkf7XvIzNsysy0z20aP7vNL7SU1yPDhw3n55ZcNgf3Eru8DGD58+IC3MdAPg7sHmAMsqh5X9mi/KiKWUTsJ3J2Zz0fEvwE3RMTIqt9/AxYOuGpJDTd27Fg6OzvxyHz/sesbwQaqP5eB/itwDnB0RHQC36D2wv/DiLgSeBa4rOp+H7UrgDZSuwz0cwCZ+UpE/APwm6rftzJz9xPLkvZjLS0tA/7mKe2f+gyAzNzbR+1N7aVvAl/cy3Z+APzgXVUnSXrPeCewJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFqisAIuLvImJ9RKyLiH+NiOERcWJEPBwRT0bE8og4pOp7aDW/sVreOhhPQJI0MMMGumJEjAG+BJySmdsi4ofAZ4BpwHczc1lE/G/gSmBx9bg1M/8iIj4D3AhcUfczkLRPrQt+0uwShoynF13U7BIGVb1DQMOAwyJiGHA48DxwLrCiWr4EmFFNT6/mqZZPjYioc/+SpAEacABk5nPAzcCz1F74u4F24NXM3Fl16wTGVNNjgE3Vujur/qN2325EzIuINRGxpqura6DlSZL6MOAAiIiR1N7VnwgcD7wPuLCXrrlrlX0s+3ND5m2Z2ZaZbaNHjx5oeZKkPtQzBHQe8P8zsyszdwA/Av4KOKoaEgIYC2yupjuBcQDV8hHAK3XsX5JUh3oC4FngrIg4vBrLnwo8Dqw
GLq36zAFWVtP3VPNUy3+RmXscAUiSGqOecwAPUzuZuxZ4rNrWbcB1wDURsZHaGP/t1Sq3A6Oq9muABXXULUmq04AvAwXIzG8A39it+SngzF76vglcVs/+JEmDxzuBJalQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqHqCoCIOCoiVkTE7yJiQ0T8ZUR8ICJ+FhFPVo8jq74REbdGxMaIeDQipgzOU5AkDUS9RwDfA+7PzA8DpwEbgAXAqswcD6yq5gEuBMZXP/OAxXXuW5JUhwEHQES8H/gEcDtAZv4xM18FpgNLqm5LgBnV9HTgrqz5NXBURBw34MolSXWp5wjgJKALuCMiHomIf4mI9wHHZubzANXjMVX/McCmHut3Vm3vEBHzImJNRKzp6uqqozxJ0r7UEwDDgCnA4sw8HXidPw/39CZ6acs9GjJvy8y2zGwbPXp0HeVJkvalngDoBDoz8+FqfgW1QNiya2inenyxR/9xPdYfC2yuY/+SpDoMOAAy8wVgU0R8qGqaCjwO3APMqdrmACur6XuA2dXVQGcB3buGiiRJjTeszvWvBpZGxCHAU8DnqIXKDyPiSuBZ4LKq733ANGAj8EbVV5LUJHUFQGZ2AG29LJraS98EvljP/iRJg8c7gSWpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKh6g6AiDg4Ih6JiHur+RMj4uGIeDIilkfEIVX7odX8xmp5a737liQN3GAcAXwZ2NBj/kbgu5k5HtgKXFm1Xwlszcy/AL5b9ZMkNUldARARY4GLgH+p5gM4F1hRdVkCzKimp1fzVMunVv0lSU1Q7xHALcC1wFvV/Cjg1czcWc13AmOq6THAJoBqeXfVX5LUBAMOgIi4GHgxM9t7NvfSNfuxrOd250XEmohY09XVNdDyJEl9qOcI4GPA30TE08AyakM/twBHRcSwqs9YYHM13QmMA6iWjwBe2X2jmXlbZrZlZtvo0aPrKE+StC8DDoDMXJiZYzOzFfgM8IvMnAWsBi6tus0BVlbT91TzVMt/kZl7HAFIkhrjvbgP4DrgmojYSG2M//aq/XZgVNV+DbDgPdi3JKmfhvXdpW+Z+SDwYDX9FHBmL33eBC4bjP1JkurnncCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBVqwAEQEeMiYnVEbIiI9RHx5ar9AxHxs4h4snocWbVHRNwaERsj4tGImDJYT0KS9O7VcwSwE/hqZp4MnAV8MSJOARYAqzJzPLCqmge4EBhf/cwDFtexb0lSnQYcAJn5fGaurab/AGwAxgDTgSVVtyXAjGp6OnBX1vwaOCoijhtw5ZKkugzKOYCIaAVOBx4Gjs3M56EWEsAxVbcxwKYeq3VWbbtva15ErImINV1dXYNRniSpF3UHQEQcAfxf4CuZ+R/76tpLW+7RkHlbZrZlZtvo0aPrLU+StBd1BUBEtFB78V+amT+qmrfsGtqpHl+s2juBcT1WHwtsrmf/kqSBq+cqoABuBzZk5j/2WHQPMKeangOs7NE+u7oa6Cyge9dQkSSp8YbVse7
HgL8FHouIjqrtemAR8MOIuBJ4FrisWnYfMA3YCLwBfK6OfUuS6jTgAMjMf6f3cX2Aqb30T+CLA92fJGlweSewJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKpQBIEmFMgAkqVAGgCQVygCQpEIZAJJUKANAkgplAEhSoQwASSqUASBJhTIAJKlQBoAkFcoAkKRCGQCSVCgDQJIKZQBIUqEMAEkqlAEgSYUyACSpUAaAJBXKAJCkQhkAklQoA0CSCmUASFKhDABJKlTDAyAiLoiIJyJiY0QsaPT+JUk1DQ2AiDgY+GfgQuAUYGZEnNLIGiRJNY0+AjgT2JiZT2XmH4FlwPQG1yBJovEBMAbY1GO+s2qTJDXYsAbvL3ppy3d0iJgHzKtmX4uIJ97zqspxNPBSs4voS9zY7ArUBP5tDq4T+tOp0QHQCYzrMT8W2NyzQ2beBtzWyKJKERFrMrOt2XVIu/NvszkaPQT0G2B8RJwYEYcAnwHuaXANkiQafASQmTsj4irg34CDgR9k5vpG1iBJqmn0EBCZeR9wX6P3K8ChNe2//NtsgsjMvntJkoYcPwpCkgplAEhSoQwASQ0XEQdFxF81u47SeQ5gCIuI0cDngVZ6nPDPzLnNqknaJSJ+lZl/2ew6Stbwq4DUUCuBh4CfA39qci3S7h6IiP8O/Ch9J9oUHgEMYRHRkZmTm12H1JuI+APwPmpvTrZR+6iYzMz3N7WwgngOYGi7NyKmNbsIqTeZeWRmHpSZLZn5/mreF/8G8ghgCOvxDuuPwI6q2XdY2i9ERACzgBMz8x8iYhxwXGb+vyaXVgwDQFJTRMRi4C3g3Mw8OSJGAg9k5keaXFoxPAk8xEXE3wCfqGYfzMx7m1mP1MNHM3NKRDwCkJlbqw+JVIN4DmAIi4hFwJeBx6ufL1dt0v5gR/U1sQlvX7b8VnNLKotDQENYRDwKTM7Mt6r5g4FHMnNScyuTICJmAVcAU4AlwKXA32fm/2lqYQVxCGjoOwp4pZoe0cxCpJ4yc2lEtANTqV0COiMzNzS5rKIYAEPbd4BHImI1tf9gnwAWNrck6R2eBP6D6rUoIv5rZj7b3JLK4RDQEBcRxwEfoRYAD2fmC00uSQIgIq4GvgFsoXYz2K4bwRyibBADYAiLiI8BHZn5ekR8ltpY6/cy85kmlyYRERupXQn0crNrKZVXAQ1ti4E3IuI04GvAM8BdzS1JetsmoLvZRZTMcwBD287MzIiYDtyambdHxJxmF6WyRcQ11eRTwIMR8RNg+67lmfmPTSmsQAbA0PaHiFgI/C3w19VloC1Nrkk6snp8tvo5pPqB6p4ANYbnAIawiPgvwEzgN5n579VnrZyTmXc3uTSJiLhs92v+e2vTe8cAGIKqD4Hb9YuN6jGr6e3ARuDrmbmqCeVJAETE2syc0leb3jsOAQ1BmXnk3pZVw0CnAkurR6mhIuJCYBowJiJu7bHo/cDO5lRVJgOgMJn5J+C3EfFPza5FxdoMrAEuA35P7ej0T9TuB/i7JtZVHIeAJDVURLQA/wv4n8DT1IYmxwF3ANdn5o69r63B5H0AkhrtJmAkcEJmTsnM04GTqH1W1c1NrawwHgFIaqiIeBL44O5fBF+dn/pdZo5vTmXl8QhAUqPl7i/+VeOf8D6AhjIAJDXa4xExe/fG6vOqfteEeorlEJCkhoqIMcCPgG1AO7V3/R8BDgM+nZnPNbG8ohgAkpoiIs4FJlC7Cmi9NyY2ngEgSYXyHIAkFcoAkKRCGQCSVCgDQJIKZQBIUqH+E63PxydaKaS5AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotUniqueValuesComparation(df_a_json, df_a_other, 'script_domain')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
sociaplus.com0.0899090.006558
tiqcdn.com0.0818660.037636
twimg.com0.079437NaN
google-analytics.com0.0000050.100417
adobedtm.com0.0084220.050673
yoox.biz0.0014980.041437
\n", + "
" + ], + "text/plain": [ + " json other\n", + "sociaplus.com 0.089909 0.006558\n", + "tiqcdn.com 0.081866 0.037636\n", + "twimg.com 0.079437 NaN\n", + "google-analytics.com 0.000005 0.100417\n", + "adobedtm.com 0.008422 0.050673\n", + "yoox.biz 0.001498 0.041437" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotTopUsageComparation(df_a_json, df_a_other, 'script_domain', 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TLD\n", + "All top 3 domains are the same for both valid JSON and non-JSON, and they remain for the filtered data. But the are some domains that only appear in the whole sample producing only smaller values[1].\n", + "\n", + "---\n", + " For futher investigation: \n", + "1. Why are there TLD that only produces smaller values? What are they? Are there also the ones that only produces bigger values? " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 248 unique script_tld present on the non-json dataset and 141 on the JSONs\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEOCAYAAACHE9xHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAE8JJREFUeJzt3X+QnVWd5/H3F9IQHTDGEBhIIo1WUIgJTWhZdnWHDMElBh1+rCApNKmJS9YqUJklYhK3ytFVRIpxGLbWVGVKJVDZSdiMFBQiE0lBgVWOmo5tSIhoygmkSQwNYgSBTBK++0c/HZuk079vbvfp96vq1n2ec895nm+nO59++txz743MRJJUrmPqXYAkqbYMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhxtS7AICTTjopGxsb612GJI0oLS0tL2TmxN76DYugb2xsZMOGDfUuQ5JGlIh4pi/9nLqRpMIZ9JJUOINekgrX6xx9REwB7gb+HHgDWJGZ/xARfwtcB7RXXZdl5kPVmKXAp4ADwGcz81/6W9i+fftoa2vj9ddf7+9Q1cjYsWOZPHkyDQ0N9S5FUj/05cnY/cBNmbkxIk4EWiLih9Vjf5+Zt3ftHBFnA9cA04DTgEci4szMPNCfwtra2jjxxBNpbGwkIvozVDWQmbz44ou0tbVxxhln1LscSf3Q69RNZu7KzI3V9svAVmBSD0MuA1Zn5t7M/DdgG3B+fwt7/fXXmTBhgiE/TEQEEyZM8C8saQTq1xx9RDQC5wI/qZpuiIhNEfGdiBhftU0CdnQZ1kbPvxh6Ot9AhqlG/H5II1Ofgz4iTgD+GbgxM/8ALAfeDTQBu4C/6+zazfDDPq8wIhZFxIaI2NDe3t7NEEnSUOjTC6YiooGOkF+Vmd8DyMzdXR7/R+DBarcNmNJl+GRg56HHzMwVwAqA5ubmXj+4tnHJ9/tSap9tv/XSHh+fNWsWS5cu5ZJLLjnYdscdd
/CrX/2Kb33rW0ccd8IJJ/DKK68MSY133303t912G5lJZrJw4UIWL148JMfudMstt7Bs2bIhPaaOvqH+/zHa9ZYPI02vV/TR8ff6t4GtmfnNLu2ndul2BbC52n4AuCYijo+IM4CpwE+HruSjY968eaxevfpNbatXr2bevHlH5fw/+MEPuOOOO1i3bh1btmxh48aNjBs3bsjPc8sttwz5MSUNL32ZuvkA8EngoohorW5zgdsi4smI2AT8JfA3AJm5BbgXeAp4GLi+vytuhoOPfexjPPjgg+zduxeA7du3s3PnTj74wQ/yyiuvMHv2bGbOnMn06dO5//77Dxv/2GOP8ZGPfOTg/g033MBdd90FQEtLCxdeeCHnnXcel1xyCbt27Tps/Ne//nVuv/12TjvtNKBjaeN1110HQGtrKxdccAEzZszgiiuu4KWXXgI6/grpfCuJF154gc73D7rrrru48sormTNnDlOnTuXmm28GYMmSJbz22ms0NTVx7bXX8sc//pFLL72Uc845h/e9732sWbNmCP4lJdVbX1bd/CgzIzNnZGZTdXsoMz+ZmdOr9r/KzF1dxnwtM9+dme/JzB/U9kuojQkTJnD++efz8MMPAx1X8x//+MeJCMaOHct9993Hxo0befTRR7npppvI7HX2Ceh4fcBnPvMZ1q5dS0tLCwsXLuSLX/ziYf02b97Meeed1+0x5s+fzze+8Q02bdrE9OnT+fKXv9zreVtbW1mzZg1PPvkka9asYceOHdx666285S1vobW1lVWrVvHwww9z2mmn8Ytf/ILNmzczZ86cPn1NkoY3Xxnbg67TN12nbTKTZcuWMWPGDC6++GKee+45du/e3dOhDnr66afZvHkzH/rQh2hqauKrX/0qbW1tfa5pz549/P73v+fCCy8EYMGCBTz++OO9jps9ezbjxo1j7NixnH322TzzzOHvhTR9+nQeeeQRvvCFL/DEE0/UZKpI0tFn0Pfg8ssvZ/369WzcuJHXXnuNmTNnArBq1Sra29tpaWmhtbWVU0455bD15WPGjOGNN944uN/5eGYybdo0WltbaW1t5cknn2TdunWHnXvatGm0tLT0q96u5zy0nuOPP/7g9rHHHsv+/fsPG3/mmWfS0tLC9OnTWbp0KV/5ylf6dX5Jw5NB34MTTjiBWbNmsXDhwjc9Cbtnzx5OPvlkGhoaePTRR7u9Oj799NN56qmn2Lt3L3v27GH9+vUAvOc976G9vZ0f//jHQMdUzpYtWw4bv3TpUm6++WZ++9vfArB3717uvPNOxo0bx/jx43niiScAuOeeew5e3Tc2Nh785bB27do+fY0NDQ3s27cPgJ07d/LWt76VT3ziEyxevJiNGzf26RiShrdh8X70fVGv5U7z5s3jyiuvfNMKnGuvvZaPfvSjNDc309TUxHvf+97Dxk2ZMoWrr76aGTNmMHXqVM4991wAjjvuONauXctnP/tZ9uzZw/79+7nxxhuZNm3am8bPnTuX3bt3c/HFF5OZRAQLFy4EYOXKlXz605/m1Vdf5V3vehff/e53AVi8eDFXX30199xzDxdddFGfvr5FixYxY8YMZs6cyfz58/n85z/PMcccQ0NDA8uXLx/Qv5mk4SX6+iRiLTU3N+ehHzyydetWzjrrrDpVpCPx+zI8uY5+aI2UdfQR0ZKZzb31c+pGkgpn0EtS4YZ10A+HaSX9id8PaWQatkE/duxYXnzxRcNlmOh8P/qxY8fWuxRJ/TRsV91MnjyZtrY2fGfL4aPzE6YkjSzDNugbGhr8JCNJGgLDdupGkjQ0DHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwvUa9BExJSIejYitEbElIj5Xtb8jIn4YEb+u7sdX7RERd0bEtojYFBEza/1FSJKOrC9X9PuBm
zLzLOAC4PqIOBtYAqzPzKnA+mof4MPA1Oq2CFg+5FVLkvqs16DPzF2ZubHafhnYCkwCLgNWVt1WApdX25cBd2eHfwXeHhGnDnnlkqQ+6dccfUQ0AucCPwFOycxd0PHLADi56jYJ2NFlWFvVJkmqgz4HfUScAPwzcGNm/qGnrt20ZTfHWxQRGyJiQ3t7e1/LkCT1U5+CPiIa6Aj5VZn5vap5d+eUTHX/fNXeBkzpMnwysPPQY2bmisxszszmiRMnDrR+SVIv+rLqJoBvA1sz85tdHnoAWFBtLwDu79I+v1p9cwGwp3OKR5J09I3pQ58PAJ8EnoyI1qptGXArcG9EfAp4FriqeuwhYC6wDXgV+OshrViS1C+9Bn1m/oju590BZnfTP4HrB1mXJGmI+MpYSSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqXK9BHxHfiYjnI2Jzl7a/jYjnIqK1us3t8tjSiNgWEU9HxCW1KlyS1Dd9uaK/C5jTTfvfZ2ZTdXsIICLOBq4BplVjvhURxw5VsZKk/us16DPzceB3fTzeZcDqzNybmf8GbAPOH0R9kqRBGswc/Q0Rsama2hlftU0CdnTp01a1HSYiFkXEhojY0N7ePogyJEk9GWjQLwfeDTQBu4C/q9qjm77Z3QEyc0VmNmdm88SJEwdYhiSpNwMK+szcnZkHMvMN4B/50/RMGzClS9fJwM7BlShJGowBBX1EnNpl9wqgc0XOA8A1EXF8RJwBTAV+OrgSJUmDMaa3DhHxT8As4KSIaAO+BMyKiCY6pmW2A/8dIDO3RMS9wFPAfuD6zDxQm9IlSX3Ra9Bn5rxumr/dQ/+vAV8bTFGSpKHjK2MlqXAGvSQVrtepG/1J45Lv17uEomy/9dJ6lyCNCl7RS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpML1GvQR8Z2IeD4iNndpe0dE/DAifl3dj6/aIyLujIhtEbEpImbWsnhJUu/6ckV/FzDnkLYlwPrMnAqsr/YBPgxMrW6LgOVDU6YkaaB6DfrMfBz43SHNlwErq+2VwOVd2u/ODv8KvD0iTh2qYiVJ/TfQOfpTMnMXQHV/ctU+CdjRpV9b1XaYiFgUERsiYkN7e/sAy5Ak9Waon4yNbtqyu46ZuSIzmzOzeeLEiUNchiSp00CDfnfnlEx1/3zV3gZM6dJvMrBz4OVJkgZroEH/ALCg2l4A3N+lfX61+uYCYE/nFI8kqT7G9NYhIv4JmAWcFBFtwJeAW4F7I+JTwLPAVVX3h4C5wDbgVeCva1CzJKkfeg36zJx3hIdmd9M3gesHW5Qkaej4ylhJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4cYMZnBEbAdeBg4A+zOzOSLeAawBGoHtwNWZ+dLgypQkDdRQXNH/ZWY2ZWZztb8EWJ+ZU4H11b4kqU5qMXVzGbCy2l4JXF6Dc0iS+miwQZ/AuohoiYhFVdspmbkLoLo/eZDnkCQNwqDm6IEPZObOiDgZ+GFE/LKvA6tfDIsA3
vnOdw6yDEnSkQzqij4zd1b3zwP3AecDuyPiVIDq/vkjjF2Rmc2Z2Txx4sTBlCFJ6sGAgz4i/iwiTuzcBv4LsBl4AFhQdVsA3D/YIiVJAzeYqZtTgPsiovM4/zczH46InwH3RsSngGeBqwZfpiRpoAYc9Jn5G+CcbtpfBGYPpihJ0tDxlbGSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klS4mgV9RMyJiKcjYltELKnVeSRJPatJ0EfEscD/AT4MnA3Mi4iza3EuSVLPanVFfz6wLTN/k5n/DqwGLqvRuSRJPahV0E8CdnTZb6vaJElH2ZgaHTe6acs3dYhYBCyqdl+JiKdrVMtodBLwQr2L6E18o94VqA782Rxap/elU62Cvg2Y0mV/MrCza4fMXAGsqNH5R7WI2JCZzfWuQzqUP5v1Uaupm58BUyPijIg4DrgGeKBG55Ik9aAmV/SZuT8ibgD+BTgW+E5mbqnFuSRJPavV1A2Z+RDwUK2Orx45Jabhyp/NOojM7L2XJGnE8i0QJKlwBr0kFc6gl1QzEXFMRPynetcx2jlHP8JFxETgOqCRLk+uZ+bCetUkdRURP87M/1jvOkazmq260VFzP/AE8AhwoM61SN1ZFxH/FfheemVZF17Rj3AR0ZqZTfWuQzqSiHgZ+DM6LkReo+MtUjIz31bXwkYR5+hHvgcjYm69i5COJDNPzMxjMrMhM99W7RvyR5FX9CNcl6ulfwf2Vc1eLWnYiIgArgXOyMz/FRFTgFMz86d1Lm3UMOgl1VRELAfeAC7KzLMiYjywLjPfX+fSRg2fjC1ARPwV8BfV7mOZ+WA965EO8R8yc2ZE/BwgM1+q3uxQR4lz9CNcRNwKfA54qrp9rmqThot91ceLJhxcEvxGfUsaXZy6GeEiYhPQlJlvVPvHAj/PzBn1rUzqEBHXAh8HZgIrgY8B/zMz/19dCxtFnLopw9uB31Xb4+pZiHSozFwVES3AbDqWVl6emVvrXNaoYtCPfF8Hfh4Rj9Lxn+gvgKX1LUk6zK+BP1BlTkS8MzOfrW9Jo4dTNwWIiFOB99MR9D/JzN/WuSTpoIj4DPAlYDcdL5rqfMGU04tHiUE/wkXEB4DWzPxjRHyCjnnQf8jMZ+pcmgRARGyjY+XNi/WuZbRy1c3Itxx4NSLOAT4PPAPcXd+SpDfZAeypdxGjmXP0I9/+zMyIuAy4MzO/HREL6l2UFBH/o9r8DfBYRHwf2Nv5eGZ+sy6FjUIG/cj3ckQsBT4J/OdqeWVDnWuSAE6s7p+tbsdVN6jW1OvocI5+hIuIPwfmAT/LzB9V7yMyKzPvqXNpEgARcdWha+a7a1PtGPQjVPVmZp3fvKjus9reC2wDvpiZ6+tQnnRQRGzMzJm9tal2nLoZoTLzxCM9Vk3fvA9YVd1LR11EfBiYC0yKiDu7PPQ2YH99qhqdDPoCZeYB4BcR8b/rXYtGtZ3ABuAq4Fd0/MV5gI719H9Tx7pGHaduJNVERDQAXwP+G7CdjmnFKcB3gWWZue/IozWUXEcvqVZuA8YDp2fmzMw8F3gXHe/HdHtdKxtlvKKXVBMR8WvgzEM/ELx6DumXmTm1PpWNPl7RS6qVPDTkq8YDuI7+qDLoJdXKUxEx/9DG6j2ZflmHekYtp24k1URETAK+B7wGtNBxFf9+4C3AFZn5XB3LG1UMekk1FREXAdPoWHWzxRfxHX0GvSQVzjl6SSqcQS9JhTPoJalwBr0kFc6gl6TC/X/qB5qnoz9HUQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotUniqueValuesComparation(df_json, df_other, 'script_tld')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
com0.6464850.650045
net0.1436210.170509
ru0.0823250.053212
fr0.0230780.006061
cn0.0060600.014413
\n", + "
" + ], + "text/plain": [ + " json other\n", + "com 0.646485 0.650045\n", + "net 0.143621 0.170509\n", + "ru 0.082325 0.053212\n", + "fr 0.023078 0.006061\n", + "cn 0.006060 0.014413" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEHCAYAAABV4gY/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFCRJREFUeJzt3X+QXWWd5/H3l/wgrkSYgcaFNNgZJ4IhUpE0CZgyiZYMRByyOswAC9Qwi6RmnAiWK0XULYpld3T8xWjtRNc4E0a0QnCZESJmh9qRBeSHTHcwDCRMNGIYeuLshMgPgULozHf/uJ1wbZr06fTtPt3Pfb+qqLrnuU+f/vSF+nDuc885NzITSVJZDqk7gCSp9Sx3SSqQ5S5JBbLcJalAlrskFchyl6QCWe6SVCDLXZIKZLlLUoGm1vWLjzrqqOzq6qrr10vSpLR58+YnM7NjuHm1lXtXVxe9vb11/XpJmpQi4vEq81yWkaQCWe6SVCDLXZIKVNuauyRV8fLLL9PX18eLL75Yd5RxNWPGDDo7O5k2bdpB/bzlLmlC6+vrY+bMmXR1dRERdccZF5nJnj176OvrY/bs2Qe1D5dlJE1oL774IkceeWTbFDtARHDkkUeO6t2K5S5pwmunYt9ntH+z5S5JBXLNXdKk0rX6uy3d384/PbvSvHe84x3cd999Lf3dY8lyv+bwUf78M63JIWlCm0zFDpO83Fvxf/CdM1oQRFLxDjvsMH784x9z3nnn8eyzz9Lf389XvvIV3vnOd3LjjTfyqU99iszk7LPP5jOf+cz+n7niiiu47bbbeN3rXsett97KG9/4xnHJ65q7JFW0fv16zjzzTLZs2cJDDz3E/Pnz2bVrF1dddRV33HEHW7Zsoaenh1tuuQWA559/ntNOO42HHnqIJUuW8LWvfW3cslruklTRqaeeyvXXX88111zDww8/zMyZM+np6WHZsmV0dHQwdepULrzwQu6++24Apk+fzvve9z4AFixYwM6dO8ctq+UuSRUtWbKEu+++m1mzZnHxxRdzww03kJmvOX/atGn7T2mcMmUK/f394xXVcpekqh5//HGOPvpoLrvsMi699FIefPBBFi1axF133cWTTz7J3r17ufHGG1m6dGndUSf3B6qS2k/VUxdbLSK48847+dznPse0adM47LDDuOGGGzjmmGP49Kc/zbve9S4yk/e+972sWLGiloy/kvdAbynGUnd3d472yzpac7bMfxzdDjwVUhpTjz76KG9961trzbBnzx5OOeUUHn+80vdktMxQf3tEbM7M7uF+1mUZSTqAXbt2cfrpp/Oxj32s7igj4rKMJB3Asccey49+9KO6Y4yYR+6SVCDLXZIKVKncI+KsiNgeETsiYvVrzPm9iNgWEVsjYn1rY0qSRmLYNfeImAKsAc4A+oCeiNiYmdua5swBPg4szsynIuLosQosSRpelQ9UFwI7MvMxgIjYAKwAtjXNuQxYk5lPAWTmv7Y6qCQBo7+T66v2d3CnMz/99NOsX7+eD33oQwDceeedfP7zn+e2225rZbqDVmVZZhbwRNN238BYs7cAb4mIeyPiBxFxVqsCStJE9PTTT/PlL3+5Zftr9a0JqpT7UN/1NPjKp6nAHGAZcAHwFxFxxKt2FLEyInojonf37t0jzSpJtbnuuuuYN28e8+
bN44tf/CKrV6/mJz/5CfPnz+fKK68E4LnnnuPcc8/lxBNP5MILL9x/35nNmzezdOlSFixYwJlnnsnPfvYzAJYtW8YnPvEJli5dype+9KWW5q2yLNMHHNe03QnsGmLODzLzZeCnEbGdRtn3NE/KzLXAWmhcoXqwoSVpPG3evJnrr7+eBx54gMxk0aJFfPOb3+SRRx5hy5YtQGNZ5oc//CFbt27l2GOPZfHixdx7770sWrSID3/4w9x66610dHRw00038clPfpJ169YBjXcAd911V8szVyn3HmBORMwG/hk4Hxh8zf4tNI7Y/yoijqKxTPNYK4NKUl3uuece3v/+9/P6178egA984AN8//vff9W8hQsX0tnZCcD8+fPZuXMnRxxxBI888ghnnHEGAHv37uWYY47Z/zPnnXfemGQettwzsz8iVgG3A1OAdZm5NSKuBXozc+PAc78VEduAvcCVmblnTBJL0jireg+uQw89dP/jfbf4zUxOOukk7r///iF/Zt//MFqt0nnumbkpM9+SmW/OzD8ZGLt6oNjJho9m5tzMfFtmbhiTtJJUgyVLlnDLLbfwwgsv8Pzzz/Ptb3+bxYsX84tf/GLYnz3hhBPYvXv3/nJ/+eWX2bp161hH9t4ykiaZGu7Eesopp3DJJZewcOFCAD74wQ+yYMECFi9ezLx581i+fDlnnz30rYinT5/OzTffzOWXX84zzzxDf38/H/nIRzjppJPGNLO3/PWWv9KENhFu+VsXb/krSfoVlrskFchylzTh1bV8XKfR/s2Wu6QJbcaMGezZs6etCj4z2bNnDzNmzDjofXi2jKQJrbOzk76+PtrtliUzZszYf0HUwbDcJU1o06ZNY/bs2XXHmHRclpGkAlnuklQgy12SCmS5S1KBLHdJKpDlLkkFstwlqUCWuyQVyHKXpAJZ7pJUIMtdkgpkuUtSgSx3SSqQ5S5JBbLcJalAlco9Is6KiO0RsSMiVg/x/CURsTsitgz888HWR5UkVTXsl3VExBRgDXAG0Af0RMTGzNw2aOpNmblqDDJKkkaoypH7QmBHZj6WmS8BG4AVYxtLkjQaVcp9FvBE03bfwNhgvxMR/xARN0fEcUPtKCJWRkRvRPS22/chStJ4qlLuMcTY4K8h/w7QlZknA38HfH2oHWXm2szszszujo6OkSWVJFVWpdz7gOYj8U5gV/OEzNyTmb8c2PwasKA18SRJB6NKufcAcyJidkRMB84HNjZPiIhjmjbPAR5tXURJ0kgNe7ZMZvZHxCrgdmAKsC4zt0bEtUBvZm4ELo+Ic4B+4OfAJWOYWZI0jGHLHSAzNwGbBo1d3fT448DHWxtNknSwvEJVkgpkuUtSgSx3SSqQ5S5JBbLcJalAlrskFchyl6QCWe6SVCDLXZIKZLlLUoEsd0kqkOUuSQWy3CWpQJa7JBXIcpekAlnuklQgy12SCmS5S1KBLHdJKpDlLkkFstwlqUCWuyQVyHKXpAJVKveIOCsitkfEjohYfYB550ZERkR36yJKkkZq2HKPiCnAGmA5MBe4ICLmDjFvJnA58ECrQ0qSRqbKkftCYEdmPpaZLwEbgBVDzPtvwGeBF1uYT5J0EKqU+yzgiabtvoGx/SLi7cBxmXnbgXYUESsjojcienfv3j3isJKkaqqUewwxlvufjDgE+DPgPw+3o8xcm5ndmdnd0dFRPaUkaUSqlHsfcFzTdiewq2l7JjAPuDMidgKnARv9UFWS6lOl3HuAORExOyKmA+cDG/c9mZnPZOZRmdmVmV3AD4BzMrN3TBJLkoY1bLlnZj+wCrgdeBT4VmZujYhrI+KcsQ4oSRq5qVUmZeYmYNOgsatfY+6y0ceSJI2GV6hKUoEsd0kqkOUuSQWy3CWpQJa7JBXIcpekAlnuklQgy12SCmS5S1KBLHdJKpDlLkkFstwlqUCWuyQVyHKXpAJZ7pJUIMtdkgpkuUtSgSx3SSqQ5S5JBbLcJalAlrskFchyl6QCWe6SVK
BK5R4RZ0XE9ojYERGrh3j+DyPi4YjYEhH3RMTc1keVJFU1bLlHxBRgDbAcmAtcMER5r8/Mt2XmfOCzwHUtTypJqqzKkftCYEdmPpaZLwEbgBXNEzLz2abN1wPZuoiSpJGaWmHOLOCJpu0+YNHgSRHxx8BHgenAu4faUUSsBFYCHH/88SPNKkmqqMqRewwx9qoj88xck5lvBq4C/stQO8rMtZnZnZndHR0dI0sqSaqsSrn3Acc1bXcCuw4wfwPwH0YTSpI0OlXKvQeYExGzI2I6cD6wsXlCRMxp2jwb+HHrIkqSRmrYNffM7I+IVcDtwBRgXWZujYhrgd7M3Aisioj3AC8DTwG/P5ahJUkHVuUDVTJzE7Bp0NjVTY+vaHEuSdIoeIWqJBXIcpekAlnuklQgy12SCmS5S1KBLHdJKpDlLkkFstwlqUCWuyQVyHKXpAJZ7pJUIMtdkgpkuUtSgSx3SSqQ5S5JBbLcJalAlrskFchyl6QCWe6SVCDLXZIKZLlLUoEsd0kqUKVyj4izImJ7ROyIiNVDPP/RiNgWEf8QEd+LiDe1Pqokqaphyz0ipgBrgOXAXOCCiJg7aNoPge7MPBm4Gfhsq4NKkqqrcuS+ENiRmY9l5kvABmBF84TM/L+Z+cLA5g+AztbGlCSNRJVynwU80bTdNzD2Wi4F/vdoQkmSRmdqhTkxxFgOOTHiIqAbWPoaz68EVgIcf/zxFSNKkkaqypF7H3Bc03YnsGvwpIh4D/BJ4JzM/OVQO8rMtZnZnZndHR0dB5NXklRBlXLvAeZExOyImA6cD2xsnhARbwe+SqPY/7X1MSVJIzFsuWdmP7AKuB14FPhWZm6NiGsj4pyBaZ8DDgP+V0RsiYiNr7E7SdI4qLLmTmZuAjYNGru66fF7WpxLkjQKXqEqSQWy3CWpQJa7JBXIcpekAlX6QFVt4prDR/nzz7Qmh6RR88hdkgpkuUtSgSx3SSqQ5S5JBbLcJalAlrskFchyl6QCWe6SVCAvYipE1+rvjnofO2e0IIikCcEjd0kqkOUuSQWy3CWpQJa7JBXIcpekAlnuklQgy12SCmS5S1KBLHdJKlClco+IsyJie0TsiIjVQzy/JCIejIj+iDi39TElSSMxbLlHxBRgDbAcmAtcEBFzB037J+ASYH2rA0qSRq7KvWUWAjsy8zGAiNgArAC27ZuQmTsHnvu3McgoSRqhKssys4Anmrb7BsYkSRNUlXKPIcbyYH5ZRKyMiN6I6N29e/fB7EKSVEGVcu8Djmva7gR2Hcwvy8y1mdmdmd0dHR0HswtJUgVVyr0HmBMRsyNiOnA+sHFsY0mSRmPYcs/MfmAVcDvwKPCtzNwaEddGxDkAEXFqRPQBvwt8NSK2jmVoSdKBVfompszcBGwaNHZ10+MeGss1kqQJwK/ZU3FG+5WDO//07BYlkerj7QckqUCWuyQVyHKXpAJZ7pJUIMtdkgpkuUtSgSx3SSqQ5S5JBbLcJalAXqEqDXbN4S3YxzOj34c0Ch65S1KBLHdJKpDlLkkFstwlqUCWuyQVyHKXpAJZ7pJUIMtdkgrkRUxSwfzKwfblkbskFchyl6QCuSwjSVVMsnsOVTpyj4izImJ7ROyIiNVDPH9oRNw08PwDEdHV6qCSpOqGLfeImAKsAZYDc4ELImLuoGmXAk9l5m8CfwZ8ptVBJUnVVVmWWQjsyMzHACJiA7AC2NY0ZwVwzcDjm4E/j4jIzGxhVknjbZItRRzIqM8cmtGiIOOkyrLMLOCJpu2+gbEh52RmP/AMcGQrAkqSRq7KkXsMMTb4iLzKHCJiJbByYPO5iNhe4fePqYCjgCcPegf/dag/fXLytWgY9esAvhbNfC1e0ZrX4k1VJlUp9z7guKbtTmDXa8zpi4ipwOHAzwfvKDPXAmurBBsvEdGbmd1155gIfC0afB1e4Wvxisn2WlRZlukB5kTE7IiYDpwPbBw0ZyPw+wOPzwXucL1dkuoz7JF7ZvZHxCrgdm
AKsC4zt0bEtUBvZm4E/hL4RkTsoHHEfv5YhpYkHVili5gycxOwadDY1U2PXwR+t7XRxs2EWiaqma9Fg6/DK3wtXjGpXotw9USSyuO9ZSSpQJa7JBXIcpekArXlXSEj4mSgi6a/PzP/prZANYmI2Zn50+HG1D4G7iX19cy8qO4sGp22K/eIWAecDGwF/m1gOIG2K3fgr4FTBo3dDCyoIUutIuJ6hriqOjP/Uw1xapOZeyOiIyKmZ+ZLdeepW0R8gMaNEI+mcSV+AJmZb6g1WAVtV+7AaZk5+K6WbSUiTgROAg4f+I93nzcAk+z2SC1zW9PjGcD7efWV2O1iJ3BvRGwEnt83mJnX1ZaoPp8FfjszH607yEi1Y7nfHxFzM3Pb8FOLdQLwPuAI4Lebxn8BXFZLoppl5l83b0fEjcDf1RSnFhHxjcy8GDiPxq27DwFm1puqdv9vMhY7tOF57hGxBPgO8C/AL3nlbdbJtQarQUScnpn3151jIoqIE4DvDnxHQVuIiG00vrfhO8Cywc9n5qvuF1W6iPgS8O+BW2j0BTA5PqNrxyP3dcDFwMO8subervZExPeAN2bmvIEPms/JzP9ed7DxFBEB7AWeaxr+F+CqehLV5n8CfwvMBnqbxoPG5xG/UUeomr0BeAH4raaxSfEZXTseud+Rme+uO8dEEBF3AVcCX83Mtw+MPZKZ8+pNNv4i4sHMHPzhcluKiK9k5h/VnWMiiIivA1dk5tMD278GfGEyfNDejkfu/xgR62m89ZxUb7PGwL/LzL9vHLju119XmJrdFxGnZmZP3UHqZrH/ipP3FTtAZj4VEW+vM1BV7Vjur6NR6pPubdYYeDIi3szAKYARcS7ws3oj1ebdwB9FxE4aZ4i07Wcx+hWHRMSvZeZTABHx60yS3pwUIVspM/+g7gwTyB/TuNPdiRHxz8BPgQvrjVSb5XUH0IT0BRrv6m6mcRD0e8Cf1BupmnZcc+8E/gewmMa/rHtorKn11RqsBhFxKI0vV+kCfh14lsbR6rV15pImkoiYS+OdXQDfmyynUbdjuf8fYD3wjYGhi4ALM/OM+lLVIyL+FngaeJDG2SIAZOYXagslqSXasdy3ZOb84cbaQbueGSO1g3a8K+STEXFRREwZ+OciYE/doWpyX0S8re4QklqvHY/cjwf+HDidxpr7fcDlmflPtQarwcAVib9J44PUtr5aVypNO5b714GPDDq16fOT4aKEVouINw01npmPj3cWSa3VdqdC0rgo4al9G5n588lyUUKrWeJSudpxzf2QgUuIgcl1UYIkVdWOpTZpL0qQpKrabs0dJu9FCZJUVVuWuySVrh3X3CWpeJa7JBXIcpekAlnuklQgy12SCvT/AbRxrevuS2knAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotTopUsageComparation(df_json, df_other, 'script_tld', 4)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 52 unique script_tld present on the non-json dataset and 89 on the JSONs\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEOCAYAAACZ2uz0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEc1JREFUeJzt3XuMlfWZwPHvIwwdrYoU0VVxHUzwhiDi1LWXrUZstGorul5KsJLFlTRZb62ogJs07Xa9xbTWZmti6ioastJlbWxs61JZTG1i3DLjKCD1EtfLiNLRVVoVKeizf8yBcBmcw9xe5jffT0Jmzjvve84DM3x5+c17zkRmIkka/PaoegBJUt8w6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYUYPpAPtv/++2dTU9NAPqQkDXotLS1vZeaY7vYb0KA3NTWxfPnygXxISRr0IuKVevZzyUWSCmHQJakQBl2SCjGga+iSdg8bN26kvb2dDz/8sOpRtJXGxkbGjh1LQ0NDj4436NIQ1N7ezj777ENTUxMRUfU4AjKTt99+m/b2dsaNG9ej+3DJRRqCPvzwQ0aPHm3MdyMRwejRo3v1vyaDLg1Rxnz309vPiUGXpEK4ht6Fprm/rHqEYrx881lVj6A69PXXfHef91NOOYV58+Zx+umnb9l2++238/zzz/OTn/xkp8ftvffevPfee30y43333cett95KZpKZzJo1izlz5vTJfW924403Mn/+/D69z0/iGbqkATd9+nQeeOCBbbY98MADTJ8+fUAe/9e//jW33347S5YsYdWqVbS2tjJy5Mg+f5wbb7yxz+/zkxh0SQPu/PPP5+GHH2bDhg0AvPzyy6xZs4YvfvGLvPfee0ydOpUpU6YwceJEHnrooR2Of+yxxzj77LO33L788su59957AWhpaeHkk0/mhBNO4PTTT+eNN97Y4fibbrqJ2267jYMPPhjovFzwsssuA6CtrY2TTjqJSZMmce655/LOO+8Anf+r2PzSJW+99RabX5fq3nvv5bzzzuOMM85g/PjxXHfddQDMnTuX9evXM3nyZGbMmMH777/PWWedxXHHHcexxx7LokWL+uBPclsGXdKAGz16NCeeeCKPPPII0Hl2ftFFFxERNDY28vOf/5zW1laWLVvGNddcQ2bWdb8bN27kiiuuYPHixbS0tDBr1ixuuOGGHfZbuXIlJ5xwQpf3cckll3DLLbfwzDPPMHHiRL773e92+7htbW0sWrSIFStWsGjRIl577TVuvvlm9txzT9ra2li4cCGPPPIIBx98ME8//TQrV67kjDPOqOv3tCsMuqRKbL3ssvVyS2Yyf/58Jk2axGmnncbrr7/O2rVr67rP5557jpUrV/LlL3+ZyZMn8/3vf5/29va6Z1q3bh3vvvsuJ598MgAzZ87kt7/9bbfHTZ06lZEjR9LY2MgxxxzDK6/s+FpaEydO5NFHH+X666/n8ccf75
clHoMuqRLTpk1j6dKltLa2sn79eqZMmQLAwoUL6ejooKWlhba2Ng488MAdrs0ePnw4H3/88Zbbmz+emUyYMIG2tjba2tpYsWIFS5Ys2eGxJ0yYQEtLyy7Nu/Vjbj/Ppz71qS3vDxs2jE2bNu1w/BFHHEFLSwsTJ05k3rx5fO9739ulx6+HQZdUib333ptTTjmFWbNmbfPN0HXr1nHAAQfQ0NDAsmXLujzbPeyww3j22WfZsGED69atY+nSpQAceeSRdHR08MQTTwCdSzCrVq3a4fh58+Zx3XXX8eabbwKwYcMG7rjjDkaOHMmoUaN4/PHHAbj//vu3nK03NTVt+Udg8eLFdf0eGxoa2LhxIwBr1qxhr7324uKLL2bOnDm0trbWdR+7wssWJVV2een06dM577zztrniZcaMGXz1q1+lubmZyZMnc9RRR+1w3KGHHsqFF17IpEmTGD9+PMcffzwAI0aMYPHixVx55ZWsW7eOTZs2cfXVVzNhwoRtjj/zzDNZu3Ytp512GplJRDBr1iwAFixYwDe/+U0++OADDj/8cO655x4A5syZw4UXXsj999/PqaeeWtfvb/bs2UyaNIkpU6ZwySWXcO2117LHHnvQ0NDAnXfe2aM/s08S9X6zoS80NzfnYPgBF16H3ne8Dn33tHr1ao4++uiqx1AXuvrcRERLZjZ3d6xLLpJUCIMuSYUw6NIQNZDLrapPbz8nBl0aghobG3n77beN+m5k8+uhNzY29vg+vMpFGoLGjh1Le3s7HR0dVY+irWz+iUU9ZdClIaihoaHHPxVHuy+XXCSpEAZdkgph0CWpEHUFPSK+FRGrImJlRPx7RDRGxLiIeDIiXoiIRRExor+HlSTtXLdBj4hDgCuB5sw8FhgGfB24BfhhZo4H3gEu7c9BJUmfrN4ll+HAnhExHNgLeAM4Fdj8kmMLgGl9P54kqV7dBj0zXwduA16lM+TrgBbg3czc/KK/7cAhXR0fEbMjYnlELPeaV0nqP/UsuYwCzgHGAQcDnwa+0sWuXT7lLDPvyszmzGweM2ZMb2aVJH2CepZcTgP+NzM7MnMj8CDweWC/2hIMwFhgTT/NKEmqQz1BfxU4KSL2iogApgLPAsuA82v7zAR2/NHckqQBU88a+pN0fvOzFVhRO+Yu4Hrg2xHxIjAauLsf55QkdaOu13LJzO8A39lu80vAiX0+kSSpR3ymqCQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiEMuiQVwqBLUiHqCnpE7BcRiyPiDxGxOiI+FxGfiYjfRMQLtbej+ntYSdLO1XuG/iPgkcw8CjgOWA3MBZZm5nhgae22JKki3QY9IvYFvgTcDZCZf8nMd4FzgAW13RYA0/prSElS9+o5Qz8c6ADuiYinIuKnEfFp4MDMfAOg9vaArg6OiNkRsTwilnd0dPTZ4JKkbdUT9OHAFODOzDweeJ9dWF7JzLsyszkzm8eMGdPDMSVJ3akn6O1Ae2Y+Wbu9mM7Ar42IgwBqb//YPyNKkurRbdAz803gtYg4srZpKvAs8AtgZm3bTOChfplQklSX4XXudwWwMCJGAC8Bf0/nPwY/i4hLgVeBC/pnRElSPeoKema2Ac1dfGhq344jSeopnykqSYUw6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYUw6JJUCIMuSYWo9+VzJe0Gmub+suoRivLyzWdVPUKf8gxdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEA
Zdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgph0CWpEAZdkgpRd9AjYlhEPBURD9duj4uIJyPihYhYFBEj+m9MSVJ3duUM/Spg9Va3bwF+mJnjgXeAS/tyMEnSrqkr6BExFjgL+GntdgCnAotruywApvXHgJKk+tR7hn47cB3wce32aODdzNxUu90OHNLVgRExOyKWR8Tyjo6OXg0rSdq5boMeEWcDf8zMlq03d7FrdnV8Zt6Vmc2Z2TxmzJgejilJ6s7wOvb5AvC1iDgTaAT2pfOMfb+IGF47Sx8LrOm/MSVJ3en2DD0z52Xm2MxsAr4O/HdmzgCWAefXdpsJPNRvU0qSutWb69CvB74dES/SuaZ+d9+MJEnqiXqWXLbIzMeAx2rvvwSc2PcjSZJ6wmeKSlIhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFaLboEfEoRGxLCJWR8SqiLiqtv0zEfGbiHih9nZU/48rSdqZes7QNwHXZObRwEnAP0bEMcBcYGlmjgeW1m5LkirSbdAz843MbK29/2dgNXAIcA6woLbbAmBafw0pSereLq2hR0QTcDzwJHBgZr4BndEHDtjJMbMjYnlELO/o6OjdtJKknao76BGxN/CfwNWZ+ad6j8vMuzKzOTObx4wZ05MZJUl1qCvoEdFAZ8wXZuaDtc1rI+Kg2scPAv7YPyNKkupRz1UuAdwNrM7MH2z1oV8AM2vvzwQe6vvxJEn1Gl7HPl8AvgGsiIi22rb5wM3AzyLiUuBV4IL+GVGSVI9ug56ZvwNiJx+e2rfjSJJ6ymeKSlIhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhDLokFcKgS1IhehX0iDgjIp6LiBcjYm5fDSVJ2nU9DnpEDAP+FfgKcAwwPSKO6avBJEm7pjdn6CcCL2bmS5n5F+AB4Jy+GUuStKt6E/RDgNe2ut1e2yZJqsDwXhwbXWzLHXaKmA3Mrt18LyKe68Vjalv7A29VPcQniVuqnkAV2e2/NmFQfX0eVs9OvQl6O3DoVrfHAmu23ykz7wLu6sXjaCciYnlmNlc9h7Q9vzar0Zsll98D4yNiXESMAL4O/KJvxpIk7aoen6Fn5qaIuBz4L2AY8G+ZuarPJpMk7ZLeLLmQmb8CftVHs2jXuZSl3ZVfmxWIzB2+jylJGoR86r8kFcKgS1IhDLqkXomIPSLi81XPIdfQB42IGANcBjSx1TezM3NWVTNJm0XEE5n5uarnGOp6dZWLBtRDwOPAo8BHFc8ibW9JRPwd8GB6llgZz9AHiYhoy8zJVc8hdSUi/gx8ms6TjfV0vjRIZua+lQ42xLiGPng8HBFnVj2E1JXM3Ccz98jMhszct3bbmA8wz9AHia3OgP4CbKxt9gxIu4WICGAGMC4z/zkiDgUOysz/qXi0IcWgS+q1iLgT+Bg4NTOPjohRwJLM/GzFow0pflN0EImIrwFfqt18LDMfrnIeaSt/k5lTIuIpgMx8p/aifRpArqEPEhFxM3AV8Gzt11W1bdLuYGPtx1ImbLnM9uNqRxp6XHIZJCLiGWByZn5cuz0MeCozJ1U7mQQRMQO4CJgCLADOB/4pM/+j0sGGGJdcBpf9gP+rvT+yykGkrWXmwohoAabSec
nitMxcXfFYQ45BHzxuAp6KiGV0/oX5EjCv2pGkbbwA/IlaVyLirzPz1WpHGlpcchlEIuIg4LN0Bv3JzHyz4pEkACLiCuA7wFo6n1y0+YlFLgkOIIM+SETEF4C2zHw/Ii6mc63yR5n5SsWjSUTEi3Re6fJ21bMMZV7lMnjcCXwQEccB1wKvAPdVO5K0xWvAuqqHGOpcQx88NmVmRsQ5wB2ZeXdEzKx6KA1tEfHt2rsvAY9FxC+BDZs/npk/qGSwIcqgDx5/joh5wDeAv61dtthQ8UzSPrW3r9Z+jaj9gto16Ro4rqEPEhHxV8B04PeZ+bvaa2Wckpn3VzyaRERcsP01511tU/8y6Lu52otybf4kRe1t1t7fALwI3JCZSysYTwIgIlozc0p329S/XHLZzWXmPjv7WG3Z5VhgYe2tNKAi4ivAmcAhEXHHVh/aF9hUzVRDl0EfxDLzI+DpiPhx1bNoyFoDLAcuAJ6n83+PH9F5Pfq3KpxrSHLJRVKPRUQD8C/APwAv07kUeChwDzA/Mzfu/Gj1Na9Dl9QbtwKjgMMyc0pmHg8cTudrDd1W6WRDkGfoknosIl4Ajtj+B0PXvr/zh8wcX81kQ5Nn6JJ6I7ePeW3jR3gd+oAz6JJ649mIuGT7jbXXG/pDBfMMaS65SOqxiDgEeBBYD7TQeVb+WWBP4NzMfL3C8YYcgy6p1yLiVGACnVe5rPKJbtUw6JJUCNfQJakQBl2SCmHQJakQBl2SCmHQJakQ/w93tLJrAKtZgwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotUniqueValuesComparation(df_a_json, df_a_other, 'script_tld')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
jsonother
com0.7050270.627121
net0.1508780.190486
ru0.0554860.002650
biz0.0014980.041437
\n", + "
" + ], + "text/plain": [ + " json other\n", + "com 0.705027 0.627121\n", + "net 0.150878 0.190486\n", + "ru 0.055486 0.002650\n", + "biz 0.001498 0.041437" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEHCAYAAABV4gY/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFK9JREFUeJzt3X+QndV93/H31/phNYZCAusOaGWkJgpYLFigRRJWRwLHFDCOFDvEloodM8WoqSsT1zVFNh1KlXbs2K5/NJEzlhNpTDyScJQaZKyOZmLMb0x3BSIgEbmyIsxGxFnWgA0MRiLf/rEr5frqSvvs7l1d3aP3a2Zn7jnPuc/97pXms+ee58eNzESSVJY3tLoASVLzGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAk1s1QuffvrpOX369Fa9vCS1pW3btj2XmR3DjWtZuE+fPp3e3t5WvbwktaWIeLrKOJdlJKlAhrskFchwl6QCtWzNXZKq2L9/P319fbz66qutLuWYmjJlCp2dnUyaNGlUzzfcJR3X+vr6OPnkk5k+fToR0epyjonMZGBggL6+PmbMmDGqfbgsI+m49uqrr3LaaaedMMEOEBGcdtppY/q0YrhLOu6dSMF+0Fh/50rhHhFXRMSuiNgdESsbbP9iRGwf+vlBRLwwpqokSWMy7Jp7REwAVgOXAX1AT0RszsydB8dk5n+sGf9R4IJxqFWSmL7yO03d397PXFVp3Nvf/nYeeuihpr72eKpyQHUusDsz9wBExEZgCbDzCOOXAf+1OeWNTLP/0aH6P7yksrVTsEO1ZZmpwDM17b6hvsNExFnADODuI2xfHhG9EdHb398/0lolqWVOOukknn32WRYuXMjs2bPp6uri/vvvB2DDhg2cd955dHV1cdNNN/3Cc26++Wbe9ra3MX/+fH784x8fs3qrhHujVf08wtilwKbMfL3Rxsxck5ndmdnd0THsfW8k6biyfv16Lr/8crZv387jjz/O7Nmz2bdvHzfddBN3330327dvp6enhzvuuAOAl19+mfnz5/P444+zcOFCvva1rx2zWquEex8wrabdCew7wtilwIaxFiVJx6OLLrqIdevWceutt/LEE09w8skn09PTwyWXXEJHRwcTJ07kmmuu4b777gNg8uTJvPvd7wZgzpw57N2795jVWiXce4CZETEjIiYzGOCb6wdFxNnALwMPN7dESTo+LFy4kPvuu4+pU6fywQ9+kNtuu43MIy1kwKRJkw6d0jhhwgQOHDhwrEodPtwz8wCwAtgKPAV8MzN3RMSqiFhcM3QZsDGP9ptKUht7+umnefOb38z111/Pddddx6OPPsq8efO49957ee6553j99dfZsGEDixYtanWp1W4/kJlbgC11fbfUtW9tXlmS1FirzmCLCO655x4+97nPMWnSJE466SRuu+02zjjjDD796U9z6aWXkpm8613vYsmSJS2p8RfqbdVEu7u7O5v9ZR2eCimV56mnnuKtb31rS2sYGBjgwgsv5OmnK31PRtM0+t0jYltmdg/3XG8/IElHsW/fPi6++GI+8YlPtLqUEfGukJJ0FGeeeSY/+MEPWl3GiDlzl6QCGe6SVCDDXZIKZLhLUoE8oCqpvdx6SpP39+KonvbCCy+wfv16PvKRjwBwzz338PnPf5677rqrmdWNmjN3SRqFF
154ga985StN21+zb01guEtSBV/4whfo6uqiq6uLL33pS6xcuZIf/vCHzJ49mxtvvBGAl156iauvvppzzjmHa6655tB9Z7Zt28aiRYuYM2cOl19+Oc8++ywAl1xyCZ/61KdYtGgRX/7yl5tar8sykjSMbdu2sW7dOh555BEyk3nz5vGNb3yDJ598ku3btwODyzKPPfYYO3bs4Mwzz2TBggU8+OCDzJs3j49+9KPceeeddHR0cPvtt3PzzTezdu1aYPATwL333tv0mg13SRrGAw88wHve8x7e9KY3AfDe97730Bd11Jo7dy6dnZ0AzJ49m71793Lqqafy5JNPctlllwHw+uuvc8YZZxx6zvvf//5xqdlwl6RhVL0H1xvf+MZDjw/e4jczOffcc3n44cZ3Qz/4B6PZXHOXpGEsXLiQO+64g1deeYWXX36Zb33rWyxYsICf/exnwz737LPPpr+//1C479+/nx07dox3yc7cJbWZUZ66OBYXXngh1157LXPnzgXgwx/+MHPmzGHBggV0dXVx5ZVXctVVje8gO3nyZDZt2sQNN9zAiy++yIEDB/jYxz7GueeeO641e8vfYeyd8m+avs9W/OeU2tXxcMvfVvGWv5KkX2C4S1KBDHdJx70T8auZx/o7G+6SjmtTpkxhYGDghAr4zGRgYIApU6aMeh+VzpaJiCuALwMTgD/NzM80GPM+4FYggcczcxyOREo60XR2dtLX10d/f3+rSzmmpkyZcuiCqNEYNtwjYgKwGrgM6AN6ImJzZu6sGTMT+CSwIDOfj4g3j7oiSaoxadIkZsyY0eoy2k6VZZm5wO7M3JOZrwEbgSV1Y64HVmfm8wCZ+Q/NLVOSNBJVwn0q8ExNu2+or9avA78eEQ9GxPeHlnEOExHLI6I3InpPtI9YknQsVQn3aNBXf2RjIjATuARYBvxpRJx62JMy12Rmd2Z2d3R0jLRWSVJFVcK9D5hW0+4E9jUYc2dm7s/MvwV2MRj2kqQWqBLuPcDMiJgREZOBpcDmujF3AJcCRMTpDC7T7GlmoZKk6oYN98w8AKwAtgJPAd/MzB0RsSoiFg8N2woMRMRO4HvAjZk5MF5FS5KOrtJ57pm5BdhS13dLzeMEPj70I0lqMa9QlaQCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgSqFe0RcERG7ImJ3RKxssP3aiOiPiO1DPx9ufqmSpKomDjcgIiYAq4HLgD6gJyI2Z+bOuqG3Z+aKcahRkjRCVWbuc4HdmbknM18DNgJLxrcsSdJYVAn3qcAzNe2+ob56vx0Rfx0RmyJiWlOqkySNSpVwjwZ9Wdf+NjA9M88H/gr4esMdRSyPiN6I6O3v7x9ZpZKkyqqEex9QOxPvBPbVDsjMgcz8+VDza8CcRjvKzDWZ2Z2Z3R0dHaOpV5JUQZVw7wFmRsSMiJgMLAU21w6IiDNqmouBp5pXoiRppIY9WyYzD0TECmArMAFYm5k7ImIV0JuZm4EbImIxcAD4CXDtONYsSRrGsOEOkJlbgC11fbfUPP4k8MnmliZJGi2vUJWkAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqUKVwj4grImJXROyOiJVHGXd1RGREdDevREnSSA0b7hExAVgNXAnMApZFxKwG404GbgAeaXaRkqSRqTJznwvszsw9mfkasBFY0mDcHwCfBV5tYn2SpFGoEu5TgWdq2n1DfYdExAXAtMy862g7iojlEdEbEb39/f0jLlaSVE2VcI8GfXloY8QbgC8C/2m4HWXmmszszszujo6O6lVKkkakSrj3AdNq2p3Avpr2yUAXcE9E7
AXmA5s9qCpJrVMl3HuAmRExIyImA0uBzQc3ZuaLmXl6Zk7PzOnA94HFmdk7LhVLkoY1bLhn5gFgBbAVeAr4ZmbuiIhVEbF4vAuUJI3cxCqDMnMLsKWu75YjjL1k7GVJksbCK1QlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklSgSuEeEVdExK6I2B0RKxts/72IeCIitkfEAxExq/mlSpKqGjbcI2ICsBq4EpgFLGsQ3usz87zMnA18FvhC0yuVJFVWZeY+F9idmXsy8zVgI7CkdkBm/rSm+SYgm1eiJGmkJlYYMxV4pqbdB8yrHxQR/wH4ODAZeEdTqpMkjUqVmXs06DtsZp6ZqzPzV4GbgP/ScEcRyyOiNyJ6+/v7R1apJKmyKuHeB0yraXcC+44yfiPwW402ZOaazOzOzO6Ojo7qVUqSRqRKuPcAMyNiRkRMBpYCm2sHRMTMmuZVwP9rXomSpJEads09Mw9ExApgKzABWJuZOyJiFdCbmZuBFRHxTmA/8DzwofEsWpJ0dFUOqJKZW4AtdX231Dz+/SbXJUkaA69QlaQCGe6SVCDDXZIKZLhLUoEMd0kqkOEuSQUy3CWpQIa7JBXIcJekAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgSqFe0RcERG7ImJ3RKxssP3jEbEzIv46Ir4bEWc1v1RJUlXDhntETABWA1cCs4BlETGrbthjQHdmng9sAj7b7EIlSdVVmbnPBXZn5p7MfA3YCCypHZCZ38vMV4aa3wc6m1umJGkkqoT7VOCZmnbfUN+RXAf8n7EUJUkam4kVxkSDvmw4MOIDQDew6AjblwPLAd7ylrdULFGSNFJVZu59wLSadiewr35QRLwTuBlYnJk/b7SjzFyTmd2Z2d3R0TGaeiVJFVQJ9x5gZkTMiIjJwFJgc+2AiLgA+CqDwf4PzS9TkjQSwy7LZOaBiFgBbAUmAGszc0dErAJ6M3Mz8DngJOAvIgLgR5m5eBzrVru69ZRx2OeLzd+n1OaqrLmTmVuALXV9t9Q8fmeT65IkjYFXqEpSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEqXaGqE9P0ld9p+j73Tmn6LiU14MxdkgpkuEtSgQx3SSqQ4S5JBTLcJalAhrskFchwl6QCGe6SVCDDXZIKZLhLUoEMd0kqUKVwj4grImJXROyOiJUNti+MiEcj4kBEXN38MiVJIzFsuEfEBGA1cCUwC1gWEbPqhv0IuBZY3+wCJUkjV+WukHOB3Zm5ByAiNgJLgJ0HB2Tm3qFt/zgONUqSRqjKssxU4Jmadt9Q34hFxPKI6I2I3v7+/tHsQpJUQZVwjwZ9OZoXy8w1mdmdmd0dHR2j2YUkqYIq4d4HTKtpdwL7xqccSVIzVAn3HmBmRMyIiMnAUmDz+JYlSRqLYcM9Mw8AK4CtwFPANzNzR0SsiojFABFxUUT0Ab8DfDUidoxn0ZKko6v0HaqZuQXYUtd3S83jHgaXayRJxwGvUJWkAhnuklQgw12SCmS4S1KBDHdJKpDhLkkFMtwlqUCVznOXNDbTV36n6fvc+5mrmr5PlcOZuyQVyHCXpAIZ7pJUIMNdkgrkAVVJArj1lHHY54vN32dFztwlqUCGuyQVyHCXpAIZ7pJUIMNdkgpkuEtSgQx3SSpQpXCPiCsiYldE7I6IlQ22vzEibh/a/khETG92oZKk6oYN94iYAKwGrgRmAcsiYlbdsOuA5zPz14AvAn/Y7EIlSdVVmbnPBXZn5p7MfA3YCCypG7ME+PrQ403Ab0RENK9MSdJIVLn9wFTgmZp2HzDvSGMy80BEvAicBjzXjCIlNVDY5fIjMS73x5/S9F22VJVwbzQDz
1GMISKWA8uHmi9FxK4Kr99SAafT7D9S/+3E/VDj+9k8vpfN1Ubv51lVBlUJ9z5gWk27E9h3hDF9ETEROAX4Sf2OMnMNsKZKYceLiOjNzO5W11EK38/m8b1srtLezypr7j3AzIiYERGTgaXA5roxm4EPDT2+Grg7Mw+buUuSjo1hZ+5Da+grgK3ABGBtZu6IiFVAb2ZuBv4M+POI2M3gjH3peBYtSTq6Svdzz8wtwJa6vltqHr8K/E5zSztutNUyUhvw/Wwe38vmKur9DFdPJKk83n5AkgpkuEtSgQx3SSqQ4d5ARJwfEYsj4r0Hf1pdUzuLiBlV+qRjLSJ+GBG/V9d3V6vqaaZKZ8ucSCJiLXA+sAP4x6HuBP53y4pqf38JXFjXtwmY04Ja2lpErKPB1d+Z+W9bUE4J9gOXRsQ84N8N3T9raotragrD/XDzM7P+rpcahYg4BzgXOKXu088/Bwq7k8cxUzurnAK8h8OvGFd1r2Tm+yPiPwP3R8T7aPDHsx0Z7od7OCJmZebOVhdSgLOBdwOnAr9Z0/8z4PqWVNTmMvMva9sRsQH4qxaVU4IAyMzPRsQ2Bi/W/JXWltQcnudeJyIWAt8G/h74OYP/+JmZ57e0sDYWERdn5sOtrqNEEXE28J2h71LQCEXEb2bmt2vaZwEfysxVLSyrKZy5H24t8EHgCf5pzV1jMxAR3wX+RWZ2RcT5wOLM/O+tLqydDH1HwuvASzXdfw/c1JqK2ldEnJOZfwP8XUTUHw8q4oCqM/c6EXF3Zr6j1XWUJCLuBW4EvpqZFwz1PZmZXa2trP1ExKOZWR9GGqGIWJOZyyPie/ziGvvBT+ptnwHO3A/3NxGxnsGlmZ8f7MxMz5YZvV/KzP9b9+VcB1pVTJt7KCIuysyeVhfSzjLz4PdKvAv4CPCvGAz5+4E/aVVdzWS4H+6fMRjq/7qmz1Mhx+a5iPhVhmZIEXE18GxrS2pb7wD+fUTsBV7GY0Jj9XXgp8D/GmovA24D3teyiprEZRmNu4j4lwzece/twPPA3wLXZObTLS2sDQ0d8DuM7+XoRMTjmfm24frakTP3OhHRCfwRsIDBmeYDwO9nZl9LC2tvfwesA77H4GlmP2Xwy13a/oyEY80Qb7rHImJ+Zn4fYOhipgdbXFNTGO6HWwes55/uT/+Bob7LWlZR+7sTeAF4FC+40XEgIp5gcPI2CfjdiPjRUPssoIhrXFyWqRMR2zNz9nB9qs4zY3S8OdLy1kElfEJy5n645yLiA8CGofYyYKCF9ZTgoYg4LzOfaHUhEpQR3sNx5l4nIt4C/DFwMYMf0x4CbsjMH7W0sDYWETuBX2PwQKpX/UrHgOFeJyK+DnwsM58fav8K8Hnvujd6nuEhHXsuyxzu/IPBDpCZP4mIC1pZULszxKVjzy/rONwbIuKXDzaGZu7+EZTUVgytw/1PBg8AbmJwzf19wP9obUmSNDKuuTcQEbMYvMw7gO96b3dJ7cZwl6QCueYuSQUy3CWpQIa7JBXIcJekAhnuklSg/w9NYc34abuLYwAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plotTopUsageComparation(df_a_json, df_a_other, 'script_tld', 3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e1ee1f20a0a14aade8bc1f40080b9461f7ed3bc5 Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 22 Apr 2019 00:13:35 -0300 Subject: [PATCH 22/23] Remove isJson_correlation_domain_and_value.ipynb --- .../isJson_correlation_domain_and_value.ipynb | 922 ------------------ 1 file changed, 922 deletions(-) delete mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb deleted file mode 100644 index f171d76..0000000 --- a/analyses/2019_03_aliamcami_value_analyses/isJson_correlation_domain_and_value.ipynb +++ /dev/null @@ -1,922 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Start Dask" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", - " data = yaml.load(f.read()) or {}\n", - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/config.py:20: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", - " defaults = yaml.load(f)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 4
  • \n", - "
  • Cores: 4
  • \n", - "
  • Memory: 8.59 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dask.dataframe as dd\n", - "from dask.distributed import Client\n", - "\n", - "#Initializing client\n", - "client = Client()\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Objective\n", - "\n", - "The objective of this notebook is to answer two main questions: \n", - " - \"The JSON values are always from the same location or related domains?\" \n", - " - \"Are there a set of location domains that always produces a JSON?\"\n", - "\n", - "To answer this we will use the sample data set produced by the notebook \"isJson_dataPrep.ipynb\" called 'all_json_above_mean.parquet' for first question and 'is_json_above_mean_md5.parquet' for seccond, this contains two extra calculated columns that will be important: 'is_json' and 'location_domain'.\n", - "\n", - "\n", - "OBS.: For \"value\" comparison I will use instead value_md5, because its reliable and faster. Value_md5 is the calculated md5 for the value columns \n", - "OBS2.: To see validation that all biggest values are json please reffer to 'isJson_Sample_Comparasion.ipynb'\n", - "\n", - "### Findings: \n", - "\n", - "On this notebook I was able to validate couple facts about the two proposed questions, which are: \n", - "- One domain produces multiple JSONs\n", - "- One JSON is usually (99.9%) produced by a single domain. \n", - "\n", - "\n", - "- One domain can produce values there are both Json or not, but most produce only one type\n", - "- Most of the domains that produce a single type produces JSON type. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# Are there a set of location domains that always produces a JSON?\n", - "The dataset used to this analise contains non-json values as well for the sake of proving that one domain may or may not produce only json values." 
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['value_md5', 'is_json', 'location_domain'], dtype='object')" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = dd.read_parquet('is_json_above_mean_md5.parquet', engine='pyarrow', columns=['value_md5', 'is_json', 'location_domain'])\n", - "df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_md5is_jsonlocation_domain
0cff77029e3ae45dd439a62987b1d8340Truecanada.ca
19ac0a0a0afb677c8fd985a7c2f4ddbc5Truetmall.com
29ac0a0a0afb677c8fd985a7c2f4ddbc5Truetmall.com
3db64465b639e01993d9212390f057628Falsecoches.net
4db64465b639e01993d9212390f057628Falsecoches.net
\n", - "
" - ], - "text/plain": [ - " value_md5 is_json location_domain\n", - "0 cff77029e3ae45dd439a62987b1d8340 True canada.ca\n", - "1 9ac0a0a0afb677c8fd985a7c2f4ddbc5 True tmall.com\n", - "2 9ac0a0a0afb677c8fd985a7c2f4ddbc5 True tmall.com\n", - "3 db64465b639e01993d9212390f057628 False coches.net\n", - "4 db64465b639e01993d9212390f057628 False coches.net" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "location_domain_group = df.compute().groupby('location_domain')" - ] - }, - { - "cell_type": "code", - "execution_count": 160, - "metadata": {}, - "outputs": [], - "source": [ - "agg = location_domain_group.agg({'value_md5': ['nunique', 'count'], 'is_json': ['sum', 'nunique']})" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_md5is_json
nuniquecountsumnunique
location_domain
0123movies.com222.01
10010.com288.01
1001freefonts.com20155155.01
10fastfingers.com42828.01
10jqka.com.cn73030.01
\n", - "
" - ], - "text/plain": [ - " value_md5 is_json \n", - " nunique count sum nunique\n", - "location_domain \n", - "0123movies.com 2 2 2.0 1\n", - "10010.com 2 8 8.0 1\n", - "1001freefonts.com 20 155 155.0 1\n", - "10fastfingers.com 4 28 28.0 1\n", - "10jqka.com.cn 7 30 30.0 1" - ] - }, - "execution_count": 161, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1563" - ] - }, - "execution_count": 178, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Count the number of domains that only produce one type of value (json or non-json)\n", - "f1 = agg['is_json']['nunique'] == 1\n", - "agg_1 = agg[f1]\n", - "oneType = len(agg_1['is_json'])\n", - "oneType" - ] - }, - { - "cell_type": "code", - "execution_count": 180, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1226" - ] - }, - "execution_count": 180, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Out of the ones there have only one type of output, these are the ones that have as JSON\n", - "f2 = agg['is_json']['sum'] > 0\n", - "agg_1a = agg[f1 & f2]\n", - "oneType_json = len(agg_1a['is_json'])\n", - "oneType_json" - ] - }, - { - "cell_type": "code", - "execution_count": 185, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 185, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "series = pd.Series([(oneType - oneType_json), (oneType_json)], index=['non-json', 'json'], name='One type')\n", - "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" - ] - }, - { - "cell_type": "code", - "execution_count": 179, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "294" - ] - }, - "execution_count": 179, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Count the number of domains that only produce one BOTH json and non-json values\n", - "agg_2 = agg[agg['is_json']['nunique'] == 2]\n", - "twoTypes = len(agg_2['is_json'])\n", - "twoTypes" - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 187, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "series = pd.Series([(oneType), (twoTypes)], index=['Json', 'both'], name='Domain output')\n", - "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "Most of the domains has only one type of value output, but not all of them. 16% have outputs that can be json and non-json\n", - "\n", - "Out of the ones that has one value type, 78% is json. \n", - "\n", - ">Are there a set of location domains that always produces a JSON?\n", - "\n", - "Yes, there is a set that always produces the value as a valid JSON, but not all of them. There are also the ones there never produces JSON and some that produces both. \n", - "\n", - "---" - ] - }, - { - "cell_type": "code", - "execution_count": 189, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "location_domain\n", - "twitter.com 5594\n", - "petsmart.com 2313\n", - "cdiscount.com 1835\n", - "debenhams.com 1229\n", - "mediamarkt.de 1094\n", - "Name: value_md5, dtype: int64" - ] - }, - "execution_count": 189, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Using the above methoed I could tell that the domains did not have only one \n", - "#output, but I could not find a way to tell the output type. \n", - "#Thats why I decided to calculate by hand as you propably notice, sorry. 
\n", - "\n", - "location_domain_group_unique_md5 = location_domain_group['value_md5'].nunique()\n", - "location_domain_group_unique_md5.sort_values(ascending=False).head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## The JSON values are always from the same location or related domains?\n", - "For \"value\" comparison I will use value_md5 instead, because its reliable and faster\n", - "\n", - "* value_md5 is the calculated md5 for the value columns" - ] - }, - { - "cell_type": "code", - "execution_count": 191, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['value_md5', 'location_domain', 'value_len'], dtype='object')" - ] - }, - "execution_count": 191, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = dd.read_parquet('all_json_above_mean.parquet', columns=['value_md5','location_domain', 'value_len'])\n", - "df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 192, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_md5location_domainvalue_len
0cff77029e3ae45dd439a62987b1d8340canada.ca3713
19ac0a0a0afb677c8fd985a7c2f4ddbc5tmall.com103878
29ac0a0a0afb677c8fd985a7c2f4ddbc5tmall.com103878
3983f2d6827a86b128a02cf7442c94af1coches.net1686
4b2ad4d7452aeed3df181b1501cc20231coches.net1686
\n", - "
" - ], - "text/plain": [ - " value_md5 location_domain value_len\n", - "0 cff77029e3ae45dd439a62987b1d8340 canada.ca 3713\n", - "1 9ac0a0a0afb677c8fd985a7c2f4ddbc5 tmall.com 103878\n", - "2 9ac0a0a0afb677c8fd985a7c2f4ddbc5 tmall.com 103878\n", - "3 983f2d6827a86b128a02cf7442c94af1 coches.net 1686\n", - "4 b2ad4d7452aeed3df181b1501cc20231 coches.net 1686" - ] - }, - "execution_count": 192, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 196, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/anaconda3/envs/overscripted/lib/python3.6/site-packages/distributed/worker.py:2791: UserWarning: Large object of size 1.89 MB detected in task graph: \n", - " (\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
location_domain
nunique
value_md5
000599fa6f59053c67e6ccbef137a0d21
0005e12de9897336bf5c7e352a8075681
00076462ead16ac77a1d56745584fd5b1
0007a2345e42bca1d5cac86e356bb87b1
000b0b6b104a36cbc6f31b923e1b31a71
\n", - "" - ], - "text/plain": [ - " location_domain\n", - " nunique\n", - "value_md5 \n", - "000599fa6f59053c67e6ccbef137a0d2 1\n", - "0005e12de9897336bf5c7e352a807568 1\n", - "00076462ead16ac77a1d56745584fd5b 1\n", - "0007a2345e42bca1d5cac86e356bb87b 1\n", - "000b0b6b104a36cbc6f31b923e1b31a7 1" - ] - }, - "execution_count": 207, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "aggmd.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 210, - "metadata": {}, - "outputs": [], - "source": [ - "f1 = aggmd['location_domain']['nunique'] > 1\n", - "aggf = aggmd[f1]" - ] - }, - { - "cell_type": "code", - "execution_count": 215, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "35746" - ] - }, - "execution_count": 215, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "unique_values_count = len(aggmd)\n", - "unique_values_count" - ] - }, - { - "cell_type": "code", - "execution_count": 218, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(35, 35711)" - ] - }, - "execution_count": 218, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "values_multiple_origin = len(aggf)\n", - "values_single_origin = unique_values_count - values_multiple_origin \n", - "(values_multiple_origin, values_single_origin )" - ] - }, - { - "cell_type": "code", - "execution_count": 219, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 219, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "series = pd.Series([(values_multiple_origin), (values_single_origin)], index=['multiple', 'single'], name='Value Origin')\n", - "series.plot.pie(figsize=(6, 6), autopct='%1.0f%%')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "The absolute most values have only one origin. Only 35 occurencies of the same value are found to have more than one domain origin. \n", - "\n", - ">The JSON values are always from the same location or related domains?\n", - "\n", - "Almost, 0,097% of the values have multiple origins, but 99,9% is only produced by one domain. \n", - "\n", - "---" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 3b8091515f70b446a9aa34cdef245fabbadda24d Mon Sep 17 00:00:00 2001 From: Camila Resende Date: Mon, 22 Apr 2019 01:37:56 -0300 Subject: [PATCH 23/23] Add isJson_Script_Domain_Output.ipynb and update readme --- .../README.md | 12 +- .../isJson_Script_Domain_Output.ipynb | 245 ++++++++++++++++++ 2 files changed, 252 insertions(+), 5 deletions(-) create mode 100644 analyses/2019_03_aliamcami_value_analyses/isJson_Script_Domain_Output.ipynb diff --git a/analyses/2019_03_aliamcami_value_analyses/README.md b/analyses/2019_03_aliamcami_value_analyses/README.md index d7195a9..0039d9f 100644 --- a/analyses/2019_03_aliamcami_value_analyses/README.md +++ b/analyses/2019_03_aliamcami_value_analyses/README.md @@ -29,13 +29,15 @@ The top 46745 gratest 
value_len are valid JSONs, that is 9.35% of the filtered s --- ## Correlation of location_domain and value -- One domain produces multiple JSONs -- One JSON is usually (99.9%) produced by a single domain. +- One domain can produce a single type of output (69%). +- 99% of the domains with a single type of output do not produce JSON. -### -- One domain can produce values there are both Json or not, but most produce only one type -- Most of the domains that produce a single type produces JSON type. +- 31% of all domains can produce JSON. +- Only 0.17% of all the domains will always have JSON as output, and about 63% of those will always have the same JSON keys. + + +- One JSON is usually (83.09%) produced by a single script domain. --- diff --git a/analyses/2019_03_aliamcami_value_analyses/isJson_Script_Domain_Output.ipynb b/analyses/2019_03_aliamcami_value_analyses/isJson_Script_Domain_Output.ipynb new file mode 100644 index 0000000..64b75e3 --- /dev/null +++ b/analyses/2019_03_aliamcami_value_analyses/isJson_Script_Domain_Output.ipynb @@ -0,0 +1,245 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Start Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/overscripted/lib/python3.6/site-packages/dask/config.py:168: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
Please read https://msg.pyyaml.org/load for full details.\n", + " data = yaml.load(f.read()) or {}\n" + ] + } + ], + "source": [ + "import dask.dataframe as dd\n", + "from dask.diagnostics import ProgressBar\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Objective\n", + "\n", + "The objective of this notebook is to answer one main question: \n", + " - \"Are there a set of location domains that always produces a JSON?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "To answer \"Are there a set of location domains that always produces a JSON?\": yes, but they amount to only 0.17% of all the domains (19 of 11185), and\n", + "about 31% of all the domains can produce both types of values, JSON and non-JSON. So only about 31% of all the domains are capable of getting JSON values; the other 69% never get a JSON value. \n", + "\n", + "---\n", + "\n", + "There are 11185 different script domains, 93.60% of those appear in multiple rows across the dataset. \n", + "\n", + "Most domains have only one type of value output: either JSON or non-JSON.\n", + "For the domains that have one type of output, 99% of the time they get the same keys_md5, but this is not very informative because every non-JSON value has an empty key set and therefore the same keys_md5. After filtering to valid JSON, only 19 domains always output valid JSON, and 63% of those always produce the same JSON keys." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DIR = 'sample_0_prep/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['is_json', 'location_domain', 'script_domain', 'keys_md5'], dtype='object')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = dd.read_parquet(DIR + 's0_domains_isJson_jsonKeys_md5_TLD.parquet',\n", + " engine='pyarrow',\n", + " columns=['is_json', 'location_domain', 'script_domain', 'keys_md5'])\n", + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Are there a set of location domains that always produces a JSON?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 3.5s\n", + "The total number of different script_domain is 11185\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " group_by_script_domain = df.compute().groupby(['script_domain'])\n", + " group_by_script_domain_len = len(group_by_script_domain)\n", + " print(\"The total number of different {} is {}\".format('script_domain', group_by_script_domain_len))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "agg = group_by_script_domain.agg({'is_json': ['nunique', 'sum'],\n", + " 'location_domain': ['nunique'],\n", + " 'keys_md5': ['nunique']})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are a total of 10469(93.60%) JSONs that appear in multiple rows\n" + ] + } + ], + "source": [ + "appear_multiple_times = 
agg['is_json'][group_by_script_domain['is_json'].count() > 1]\n", + "appear_multiple_times_len = len(appear_multiple_times)\n", + "agg_len = len(agg['is_json'])\n", + "print('There are a total of {0}({1:0.2f}%) JSONs that appear in multiple rows'.format(\n", + " appear_multiple_times_len, \n", + " appear_multiple_times_len*100/agg_len))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unique(agg, column, title=''):\n", + " agg_len = len(agg[column])\n", + " x = agg[agg[column]['nunique'] == 1]\n", + " x_len = len(x)\n", + " print(title + '{0} ({1:0.2f}%) unique {2},\\n{3} ({4:0.2f}%) multiple {2}'.format(\n", + " x_len,\n", + " x_len*100/agg_len,\n", + " column, \n", + " agg_len - x_len,\n", + " (agg_len - x_len) * 100 / agg_len\n", + " ))\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "JSON data:\n", + "7697 (68.82%) unique is_json,\n", + "3488 (31.18%) multiple is_json\n", + "\n", + "KEYS data: out of the unique jsons\n", + "7690 (99.91%) unique keys_md5,\n", + "7 (0.09%) multiple keys_md5\n" + ] + } + ], + "source": [ + "unique_is_json = get_unique(agg, 'is_json', 'JSON data:\\n')\n", + "unique_json_key = get_unique(unique_is_json, 'keys_md5', '\\nKEYS data: out of the unique jsons\\n')\n", + "#it may not be very accurate because every non-json value has a empty key and thus equal" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are only 19 domains with unique valid json output\n", + "\n", + "KEYS data: out of the unique valid jsons\n", + "12 (63.16%) unique keys_md5,\n", + "7 (36.84%) multiple keys_md5\n" + ] + } + ], + "source": [ + "#Filter to only valid jsons\n", + "unique_is_json_jsons = unique_is_json[unique_is_json['is_json']['sum'] > 0]\n", + 
"print(\"There are only {} domains with unique valid json output\".format(len(unique_is_json_jsons)))\n", + "unique_key_jsons = get_unique(unique_is_json_jsons, 'keys_md5', '\\nKEYS data: out of the unique valid jsons\\n')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}