diff --git a/DBSCAN_1-zmiany_df.ipynb b/DBSCAN_1-zmiany_df.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e73fa9a39778f2f7174671da8c26cde128aeb617 --- /dev/null +++ b/DBSCAN_1-zmiany_df.ipynb @@ -0,0 +1,4517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pyodbc\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import urllib\n", + "import seaborn as sns\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params = urllib.parse.quote_plus(\"DRIVER={ODBC Driver 17 for SQL Server};\"\n", + " #\"SERVER=dbserver.mif.pg.gda.pl,1433;\"\n", + " \"SERVER=127.0.0.1,1433;\"\n", + " \"DATABASE=silkycoders;\"\n", + " \"UID=;\"\n", + " \"PWD=\")\n", + "\n", + "engine = create_engine(\"mssql+pyodbc:///?odbc_connect={}\".format(params))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"SELECT rfid.*, it.*, sub.*, cl.*, dep.*, br.*\n", + " FROM rfid.Logs rfid \n", + " JOIN rfid.EanEpc ean \n", + " ON rfid.EPC = ean.EPC \n", + " JOIN dw.Item it \n", + " ON ean.EAN = it.EAN \n", + " JOIN dw.Subclass sub \n", + " ON sub.SubclassID = it.SubclassID\n", + " JOIN dw.Class cl\n", + " ON sub.ClassID = cl.ClassID\n", + " JOIN dw.Department dep\n", + " ON dep.DepartmentID = cl.DepartmentID\n", + " JOIN dw.Brand br\n", + " ON dep.BrandID = br.BrandID\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql_query(query, engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_timestamp(df : pd.DataFrame):\n", + " \n", + " dt = 
df.sort_values(by=\"TIMESTAMP\").reset_index(drop=True)\n", + " dt[\"HOUR\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.hour.astype(int)\n", + " dt[\"MIN\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.minute.astype(int)\n", + " dt[\"SEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.second.astype(int)\n", + " dt[\"MICROSEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.microsecond.astype(int)\n", + "\n", + " dt[\"MILISEC\"] = dt.MICROSEC/1000 + dt.SEC*1000 + dt.MIN*60000 + dt.HOUR*3600000\n", + " dt[\"TIME_MS\"] = dt.MILISEC - dt.MILISEC[0]\n", + " \n", + " dt['TIME_PER_MEASUREMENT_MS'] = 0\n", + " dt['NUMBER_OF_SIGNALS'] = 0\n", + " dt['LENGTH_OF_MEASUREMENT'] = 0\n", + " \n", + " for m in dt.MEASUREMENT.unique():\n", + " filtr = (dt.MEASUREMENT == m)\n", + " dt.loc[filtr,'TIME_PER_MEASUREMENT_MS'] = dt[filtr].MILISEC - dt[filtr].MILISEC.iloc[0]\n", + " dt.loc[filtr, \"NUMBER_OF_SIGNALS\"] = len(dt[filtr])\n", + " dt.loc[filtr, 'LENGTH_OF_MEASUREMENT'] = dt[filtr].TIME_PER_MEASUREMENT_MS.max()\n", + " \n", + " dt[\"TIME_KMS\"] = np.floor(dt.TIME_MS/1000) \n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].max().reset_index(name=\"MAX_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].sum().reset_index(name=\"SUM_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " return dt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = prepare_timestamp(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('df.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 502689 entries, 0 to 502688\n", + "Data columns (total 36 columns):\n", + " # Column 
Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 EPC 502689 non-null object \n", + " 1 PROXIMITY 502689 non-null float64\n", + " 2 TIMESTAMP 502689 non-null object \n", + " 3 MEASUREMENT 502689 non-null int64 \n", + " 4 ITEMID 502689 non-null int64 \n", + " 5 EAN 502689 non-null int64 \n", + " 6 StyleColorSize 502689 non-null object \n", + " 7 StyleColor 502689 non-null object \n", + " 8 Size 502689 non-null object \n", + " 9 SubclassID 502689 non-null int64 \n", + " 10 ItemSeason 502689 non-null object \n", + " 11 FashionLevel 369997 non-null object \n", + " 12 SubclassID.1 502689 non-null int64 \n", + " 13 SubclassName 502689 non-null object \n", + " 14 ClassID 502689 non-null int64 \n", + " 15 ClassID.1 502689 non-null int64 \n", + " 16 ClassName 502689 non-null object \n", + " 17 DepartmentID 502689 non-null int64 \n", + " 18 DepartmentID.1 502689 non-null int64 \n", + " 19 DepartmentName 502689 non-null object \n", + " 20 BrandID 502689 non-null int64 \n", + " 21 BrandID.1 502689 non-null int64 \n", + " 22 BrandName 502689 non-null object \n", + " 23 Active 502689 non-null bool \n", + " 24 HOUR 502689 non-null int64 \n", + " 25 MIN 502689 non-null int64 \n", + " 26 SEC 502689 non-null int64 \n", + " 27 MICROSEC 502689 non-null int64 \n", + " 28 MILISEC 502689 non-null float64\n", + " 29 TIME_MS 502689 non-null float64\n", + " 30 TIME_PER_MEASUREMENT_MS 502689 non-null float64\n", + " 31 NUMBER_OF_SIGNALS 502689 non-null int64 \n", + " 32 LENGTH_OF_MEASUREMENT 502689 non-null int64 \n", + " 33 TIME_KMS 502689 non-null float64\n", + " 34 MAX_PROXIMITY_KMS 502689 non-null float64\n", + " 35 SUM_PROXIMITY_KMS 502689 non-null float64\n", + "dtypes: bool(1), float64(7), int64(17), object(11)\n", + "memory usage: 134.7+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody 
tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>...</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.00000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.406017</td>\n", + " <td>23.412382</td>\n", + " <td>2.169862e+06</td>\n", + " <td>5.902835e+12</td>\n", + " <td>83.920704</td>\n", + " <td>83.920704</td>\n", + " <td>18.231477</td>\n", + " <td>18.231477</td>\n", + " <td>2.609574</td>\n", + " 
<td>2.609574</td>\n", + " <td>...</td>\n", + " <td>29.193547</td>\n", + " <td>499773.110213</td>\n", + " <td>3.721192e+07</td>\n", + " <td>2.018186e+06</td>\n", + " <td>100132.210719</td>\n", + " <td>15383.906986</td>\n", + " <td>199835.398777</td>\n", + " <td>2017.68607</td>\n", + " <td>-72.497318</td>\n", + " <td>-398.108291</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.698062</td>\n", + " <td>12.175284</td>\n", + " <td>7.798483e+04</td>\n", + " <td>7.380986e+07</td>\n", + " <td>142.489244</td>\n", + " <td>142.489244</td>\n", + " <td>8.844056</td>\n", + " <td>8.844056</td>\n", + " <td>0.937828</td>\n", + " <td>0.937828</td>\n", + " <td>...</td>\n", + " <td>17.223297</td>\n", + " <td>288469.414710</td>\n", + " <td>1.121487e+06</td>\n", + " <td>1.121487e+06</td>\n", + " <td>81859.831696</td>\n", + " <td>8217.121271</td>\n", + " <td>101049.072703</td>\n", + " <td>1121.48684</td>\n", + " <td>5.893956</td>\n", + " <td>262.167663</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.00000</td>\n", + " <td>-100.500000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>13.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>250000.000000</td>\n", + 
" <td>3.624898e+07</td>\n", + " <td>1.055248e+06</td>\n", + " <td>38108.000000</td>\n", + " <td>8533.000000</td>\n", + " <td>127122.000000</td>\n", + " <td>1055.00000</td>\n", + " <td>-76.400000</td>\n", + " <td>-515.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155604e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.719645e+07</td>\n", + " <td>2.002711e+06</td>\n", + " <td>78477.000000</td>\n", + " <td>13321.000000</td>\n", + " <td>176026.000000</td>\n", + " <td>2002.00000</td>\n", + " <td>-72.900000</td>\n", + " <td>-342.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>33.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>749000.000000</td>\n", + " <td>3.815973e+07</td>\n", + " <td>2.965991e+06</td>\n", + " <td>139431.000000</td>\n", + " <td>22217.000000</td>\n", + " <td>265127.000000</td>\n", + " <td>2965.00000</td>\n", + " <td>-68.400000</td>\n", + " <td>-225.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-38.900000</td>\n", + " <td>43.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.912875e+07</td>\n", + " <td>3.935013e+06</td>\n", + " 
<td>435771.000000</td>\n", + " <td>35350.000000</td>\n", + " <td>435771.000000</td>\n", + " <td>3935.00000</td>\n", + " <td>-38.900000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 502689.000000 502689.000000 5.026890e+05 5.026890e+05 \n", + "mean -75.406017 23.412382 2.169862e+06 5.902835e+12 \n", + "std 5.698062 12.175284 7.798483e+04 7.380986e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 13.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155604e+06 5.902806e+12 \n", + "75% -71.900000 33.000000 2.226340e+06 5.902852e+12 \n", + "max -38.900000 43.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 502689.000000 502689.000000 502689.000000 502689.000000 \n", + "mean 83.920704 83.920704 18.231477 18.231477 \n", + "std 142.489244 142.489244 8.844056 8.844056 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 502689.000000 502689.000000 ... 502689.000000 502689.000000 \n", + "mean 2.609574 2.609574 ... 29.193547 499773.110213 \n", + "std 0.937828 0.937828 ... 17.223297 288469.414710 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 250000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 749000.000000 \n", + "max 6.000000 6.000000 ... 
59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 5.026890e+05 5.026890e+05 502689.000000 502689.000000 \n", + "mean 3.721192e+07 2.018186e+06 100132.210719 15383.906986 \n", + "std 1.121487e+06 1.121487e+06 81859.831696 8217.121271 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.624898e+07 1.055248e+06 38108.000000 8533.000000 \n", + "50% 3.719645e+07 2.002711e+06 78477.000000 13321.000000 \n", + "75% 3.815973e+07 2.965991e+06 139431.000000 22217.000000 \n", + "max 3.912875e+07 3.935013e+06 435771.000000 35350.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 502689.000000 502689.00000 502689.000000 \n", + "mean 199835.398777 2017.68607 -72.497318 \n", + "std 101049.072703 1121.48684 5.893956 \n", + "min 53538.000000 0.00000 -100.500000 \n", + "25% 127122.000000 1055.00000 -76.400000 \n", + "50% 176026.000000 2002.00000 -72.900000 \n", + "75% 265127.000000 2965.00000 -68.400000 \n", + "max 435771.000000 3935.00000 -38.900000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 502689.000000 \n", + "mean -398.108291 \n", + "std 262.167663 \n", + "min -2629.400000 \n", + "25% -515.900000 \n", + "50% -342.900000 \n", + "75% -225.700000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'whiskers': [<matplotlib.lines.Line2D at 0x215c384d970>,\n", + " <matplotlib.lines.Line2D at 0x215c384dcd0>],\n", + " 'caps': [<matplotlib.lines.Line2D at 0x215c4d13070>,\n", + " <matplotlib.lines.Line2D at 0x215c4d133d0>],\n", + " 'boxes': [<matplotlib.lines.Line2D at 0x215c384d610>],\n", + " 'medians': [<matplotlib.lines.Line2D at 0x215c4d13730>],\n", + " 'fliers': [<matplotlib.lines.Line2D at 
0x215c4d13a90>],\n", + " 'means': []}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD6CAYAAAC/KwBlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXaklEQVR4nO3dYYyd1X3n8e+vONqibU1tGJDjIWsUnNVCpXXFlYOUN9lQ2VZa1UQCxSu1+IUlR4hIqbZSFfrGBLRSWCWlQiuQyIIwtBuwaCusKJR1oFW0EjGMUxowBHkkaHCw8FTjEvIGyea/L+6ZzZ3pcHxnjMdh8v1Ij+4z/+ecM899Yf/mec5z70lVIUnSB/m1C30CkqRfbgaFJKnLoJAkdRkUkqQug0KS1GVQSJK6xg6KJBcl+cck32k/35Hkp0lebNvnR9renmQ6yWtJto/Ur0vyUjt2b5K0+r9L8nirH06yaaTP7iTH2rb7Q3nXkqSxrVlC268ArwJrR2r3VNU3RhsluQbYBVwLfBz4XpJPVdUZ4H5gL/AD4LvADuApYA9wqqquTrILuBv4YpL1wD5gABRwJMnBqjr1QSd52WWX1aZNm5bwtiRJR44c+Zeqmljs2FhBkWQS+D3gvwP/7SzNdwKPVdV7wOtJpoGtSd4A1lbVc23MR4AbGQbFTuCO1v8J4H+2q43twKGqmm19DjEMl29/0C/ftGkTU1NT47wtSVKT5J8/6Ni4t57+AvhT4P0F9S8n+VGSh5Ksa7WNwJsjbY632sa2v7A+r09VnQbeAS7tjCVJWiFnDYokvw+crKojCw7dD3wS2AKcAL4512WRYapTX26f0XPcm2QqydTMzMwiXSRJyzXOFcVngD9ot44eAz6X5C+r6u2qOlNV7wPfAra29seBK0f6TwJvtfrkIvV5fZKsAS4BZjtjzVNVD1TVoKoGExOL3mKTJC3TWYOiqm6vqsmq2sRwkvrZqvrDJBtGmn0BeLntHwR2tSeZrgI2A89X1Qng3STXt/mHW4AnR/rMPdF0U/sdBTwNbEuyrt3a2tZqkqQVspSnnhb6H0m2MLwV9AbwJYCqOprkAPAKcBq4rT3xBHAr8DBwMcNJ7Kda/UHg0TbxPcswkKiq2SR3AS+0dnfOTWxLklZGVtvXjA8Gg/KpJ0lamiRHqmqw2DE/mS1J6jIoJEld5zJHIf1Ka99Ac96tttvD+ugxKKRlWs5/4En8j18fOd56kiR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUtfYQZHkoiT/mOQ77ef1SQ4lOdZe1420vT3JdJLXkmwfqV+X5KV27N62djZtfe3HW/1wkk0jfXa333EsyW4kSStqKVcUXwFeHfn5q8AzVbUZeKb9TJJrGK55fS2wA7gvyUWtz/3AXmBz23a0+h7gVFVdDdwD3N3GWg/sAz4NbAX2jQaSJOn8GysokkwCvwf8r5HyTmB/298P3DhSf6yq3quq14FpYGuSDcDaqnquhl/I/8iCPnNjPQHc0K42tgOHqmq2qk4Bh/hFuEiSVsC4VxR/Afwp8P5I7YqqOgHQXi9v9Y3AmyPtjrfaxra/sD6vT1WdBt4BLu2MJUlaIWcNiiS/D5ysqiNjjrnY+pDVqS+3z+g57k0ylWRqZmZmzNOUJI1jnCuKzwB/kOQN4DHgc0n+Eni73U6ivZ5s7Y8DV470nwTeavXJRerz+iRZA1wCzHbGmqeqHqiqQVUNJiYmxnhLkqRxnTUoqur2qpqsqk0MJ6mfrao/BA4Cc08h7QaebPsHgV3tSaarGE5aP99
uT72b5Po2/3DLgj5zY93UfkcBTwPbkqxrk9jbWk2StELWnEPfrwMHkuwBfgLcDFBVR5McAF4BTgO3VdWZ1udW4GHgYuCptgE8CDyaZJrhlcSuNtZskruAF1q7O6tq9hzOWZK0RBn+4b56DAaDmpqautCnIS0qCavt35xWhyRHqmqw2DE/mS1J6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqOmtQJPn1JM8n+ackR5N8rdXvSPLTJC+27fMjfW5PMp3ktSTbR+rXJXmpHbu3rZ1NW1/78VY/nGTTSJ/dSY61bTeSpBU1zprZ7wGfq6qfJ/kY8H+TzK11fU9VfWO0cZJrGK55fS3wceB7ST7V1s2+H9gL/AD4LrCD4brZe4BTVXV1kl3A3cAXk6wH9gEDoIAjSQ5W1alze9uSpHGd9Yqihn7efvxY23qL/u4EHquq96rqdWAa2JpkA7C2qp6r4aLBjwA3jvTZ3/afAG5oVxvbgUNVNdvC4RDDcJEkrZCx5iiSXJTkReAkw/+4D7dDX07yoyQPJVnXahuBN0e6H2+1jW1/YX1en6o6DbwDXNoZS5K0QsYKiqo6U1VbgEmGVwe/zfA20ieBLcAJ4JuteRYbolNfbp//L8neJFNJpmZmZjrvRJK0VEt66qmq/hX4B2BHVb3dAuR94FvA1tbsOHDlSLdJ4K1Wn1ykPq9PkjXAJcBsZ6yF5/VAVQ2qajAxMbGUtyRJOotxnnqaSPJbbf9i4HeBH7c5hzlfAF5u+weBXe1JpquAzcDzVXUCeDfJ9W3+4RbgyZE+c0803QQ82+Yxnga2JVnXbm1tazVJ0goZ56mnDcD+JBcxDJYDVfWdJI8m2cLwVtAbwJcAqupokgPAK8Bp4Lb2xBPArcDDwMUMn3aae3rqQeDRJNMMryR2tbFmk9wFvNDa3VlVs8t/u5KkpcrwD/fVYzAY1NTU1IU+DWlRSVht/+a0OiQ5UlWDxY75yWxJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklS1zhrZv96kueT/FOSo0m+1urrkxxKcqy9rhvpc3uS6SSvJdk+Ur8uyUvt2L1t7Wza+tqPt/rhJJtG+uxuv+NYkt1IklbUOFcU7wGfq6r/DGwBdiS5Hvgq8ExVbQaeaT+T5BqGa15fC+wA7mvrbQPcD+wFNrdtR6vvAU5V1dXAPcDdbaz1wD7g08BWYN9oIEmSzr+zBkUN/bz9+LG2FbAT2N/q+4Eb2/5O4LGqeq+qXgemga1JNgBrq+q5Gi4a/MiCPnNjPQHc0K42tgOHqmq2qk4Bh/hFuEiSVsBYcxRJLkryInCS4X/ch4ErquoEQHu9vDXfCLw50v14q21s+wvr8/pU1WngHeDSzlgLz29vkqkkUzMzM+O8JUnSmMYKiqo6U1VbgEmGVwe/3WmexYbo1JfbZ/T8HqiqQVUNJiYmOqcmSVqqJT31VFX/CvwDw9s/b7fbSbTXk63ZceDKkW6TwFutPrlIfV6fJGuAS4DZzliSpBUyzlNPE0l+q+1fDPwu8GPgIDD3FNJu4Mm2fxDY1Z5kuorhpPXz7fbUu0mub/MPtyzoMzfWTcCzbR7jaWBbknVtEntbq0mSVsiaMdpsAPa3J5d+DThQVd9J8hxwIMke4CfAzQBVdTTJAeAV4DRwW1WdaWPdCjwMXAw81TaAB4FHk0wzvJLY1caaTXIX8EJrd2dVzZ7LG5YkLU2Gf7ivHoPBoKampi70aUiLSsJq+zen1SHJkaoaLHbMT2ZLkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV3jfHus9Cth/fr1nDp16rz/nuG37J8/69a
tY3bWL1nWh8egkJpTp06tim92Pd9BpF893nqSJHUZFJKkLoNCktQ1zprZVyb5+ySvJjma5CutfkeSnyZ5sW2fH+lze5LpJK8l2T5Svy7JS+3YvW3tbNr62o+3+uEkm0b67E5yrG27kSStqHEms08Df1JVP0zym8CRJIfasXuq6hujjZNcw3DN62uBjwPfS/Kptm72/cBe4AfAd4EdDNfN3gOcqqqrk+wC7ga+mGQ9sA8YANV+98GqOv+PpkiSgDGuKKrqRFX9sO2/C7wKbOx02Qk8VlXvVdXrwDSwNckGYG1VPVfDR0seAW4c6bO/7T8B3NCuNrYDh6pqtoXDIYbhIklaIUuao2i3hH4HONxKX07yoyQPJVnXahuBN0e6HW+1jW1/YX1en6o6DbwDXNoZa+F57U0ylWRqZmZmKW9JknQWYwdFkt8A/hr446r6GcPbSJ8EtgAngG/ONV2ke3Xqy+3zi0LVA1U1qKrBxMRE721IkpZorKBI8jGGIfFXVfU3AFX1dlWdqar3gW8BW1vz48CVI90ngbdafXKR+rw+SdYAlwCznbEkSStknKeeAjwIvFpVfz5S3zDS7AvAy23/ILCrPcl0FbAZeL6qTgDvJrm+jXkL8ORIn7knmm4Cnm3zGE8D25Ksa7e2trWaJGmFjPPU02eAPwJeSvJiq/0Z8F+TbGF4K+gN4EsAVXU0yQHgFYZPTN3WnngCuBV4GLiY4dNOT7X6g8CjSaYZXknsamPNJrkLeKG1u7Oq/BIbSVpBWQ3fbTNqMBjU1NTUhT4NfQQlWTXf9bQa3odWVpIjVTVY7JifzJYkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqWucpVCvTPL3SV5NcjTJV1p9fZJDSY6113UjfW5PMp3ktSTbR+rXJXmpHbu3LYlKWzb18VY/nGTTSJ/d7XccS7IbSdKKGueK4jTwJ1X1n4DrgduSXAN8FXimqjYDz7Sfacd2AdcCO4D7klzUxrof2MtwHe3N7TjAHuBUVV0N3APc3cZaD+wDPg1sBfaNBpIk6fw7a1BU1Ymq+mHbfxd4FdgI7AT2t2b7gRvb/k7gsap6r6peB6aBrUk2AGur6rkartP4yII+c2M9AdzQrja2A4eqaraqTgGH+EW4SJJWwJLmKNotod8BDgNXVNUJGIYJcHlrthF4c6Tb8Vbb2PYX1uf1qarTwDvApZ2xJEkrZOygSPIbwF8Df1xVP+s1XaRWnfpy+4ye294kU0mmZmZmOqcmSVqqsYIiyccYhsRfVdXftPLb7XYS7fVkqx8HrhzpPgm81eqTi9Tn9UmyBrgEmO2MNU9VPVBVg6oaTExMjPOWJEljGueppwAPAq9W1Z+PHDoIzD2FtBt4cqS+qz3JdBXDSevn2+2pd5Nc38a8ZUGfubFuAp5t8xhPA9uSrGuT2NtaTZK0QtaM0eYzwB8BLyV5sdX+DPg6cCDJHuAnwM0AVXU0yQHgFYZPTN1WVWdav1uBh4GLgafaBsMgejTJNMMriV1trNkkdwEvtHZ3VtXs8t6qJGk5MvzDffUYDAY1NTV1oU9DH0FJWA3/HlbL+9DKSnKkqgaLHfOT2ZKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1jfPJbOlXQu1bC3dccqFP45zVvrUX+hS0yhgUUpOv/WxVfKI5CXXHhT4LrSbeepIkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV3jrJn9UJKTSV4eqd2R5KdJXmzb50eO3Z5kOslrSbaP1K9L8lI7dm9bN5u2tvbjrX44yaaRPruTHGvb3JrakqQVNM4VxcPAjkXq91TVlrZ9FyDJNQzXu7629bkvyUWt/f3AXmBz2+bG3AOcqqqrgXuAu9tY64F9wKeBrcC+JOuW/A4lSefkrEFRVd8HZsccbyfwWFW
9V1WvA9PA1iQbgLVV9VwNP/r6CHDjSJ/9bf8J4IZ2tbEdOFRVs1V1CjjE4oElSTqPzmWO4stJftRuTc39pb8ReHOkzfFW29j2F9bn9amq08A7wKWdsSRJK2i5QXE/8ElgC3AC+GarZ5G21akvt888SfYmmUoyNTMz0zltSdJSLSsoqurtqjpTVe8D32I4hwDDv/qvHGk6CbzV6pOL1Of1SbIGuIThra4PGmux83mgqgZVNZiYmFjOW5IkfYBlBUWbc5jzBWDuiaiDwK72JNNVDCetn6+qE8C7Sa5v8w+3AE+O9Jl7oukm4Nk2j/E0sC3JunZra1urSZJW0Fm/ZjzJt4HPApclOc7wSaTPJtnC8FbQG8CXAKrqaJIDwCvAaeC2qjrThrqV4RNUFwNPtQ3gQeDRJNMMryR2tbFmk9wFvNDa3VlV406qS5I+JFkN378/ajAY1NTU1IU+DX0EJVk961GsgvehlZXkSFUNFjvmJ7MlSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVLXWT9wJ/0qacukfKStW+e38evDZVBIzUp8SM0Pw+mjyFtPkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHWdNSiSPJTkZJKXR2rrkxxKcqy9rhs5dnuS6SSvJdk+Ur8uyUvt2L1t7Wza+tqPt/rhJJtG+uxuv+NYkrl1tSVJK2icK4qHgR0Lal8FnqmqzcAz7WeSXMNwzetrW5/7klzU+twP7AU2t21uzD3Aqaq6GrgHuLuNtZ7h+tyfBrYC+0YDSZK0Ms4aFFX1fWB2QXknsL/t7wduHKk/VlXvVdXrwDSwNckGYG1VPVfD7y94ZEGfubGeAG5oVxvbgUNVNVtVp4BD/NvAkiSdZ8udo7iiqk4AtNfLW30j8OZIu+OttrHtL6zP61NVp4F3gEs7Y0mSVtCHPZm92FdvVqe+3D7zf2myN8lUkqmZmZmxTlSSNJ7lBsXb7XYS7fVkqx8HrhxpNwm81eqTi9Tn9UmyBriE4a2uDxrr36iqB6pqUFWDiYmJZb4lSdJilhsUB4G5p5B2A0+O1He1J5muYjhp/Xy7PfVukuvb/MMtC/rMjXUT8Gybx3ga2JZkXZvE3tZqkqQVdNb1KJJ8G/gscFmS4wyfRPo6cCDJHuAnwM0AVXU0yQHgFeA0cFtVnWlD3crwCaqLgafaBvAg8GiSaYZXErvaWLNJ7gJeaO3urKqFk+qSpPMsq20RlcFgUFNTUxf6NKRFuXCRflklOVJVg8WO+clsSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUtc5BUWSN5K8lOTFJFOttj7JoSTH2uu6kfa3J5lO8lqS7SP169o400nubetq09befrzVDyfZdC7nK0laug/jiuK/VNWWkSX0vgo8U1WbgWfazyS5huF62NcCO4D7klzU+twP7AU2t21Hq+8BTlXV1cA9wN0fwvlKkpbgfNx62gnsb/v7gRtH6o9V1XtV9TowDWxNsgFYW1XP1XAx4UcW9Jkb6wnghrmrDelCS7LkbTn9pAvtXIOigP+T5EiSva12RVWdAGivl7f6RuDNkb7HW21j219Yn9enqk4D7wCXnuM5Sx+KqlqRTbrQ1pxj/89U1VtJLgcOJflxp+1ifxpVp97rM3/gYUjtBfjEJz7RP2NJ0pKc0xVFVb3VXk8CfwtsBd5ut5Norydb8+PAlSPdJ4G3Wn1ykfq8PknWAJcAs4ucxwNVNaiqwcTExLm8JUnSAssOiiT/Pslvzu0D24CXgYPA7tZsN/Bk2z8I7GpPMl3FcNL6+XZ76t0k17f5h1sW9Jkb6ybg2fJaXJJW1LnceroC+Ns22bYG+N9V9XdJXgAOJNkD/AS4GaCqjiY5ALwCnAZuq6ozbaxbgYeBi4Gn2gbwIPBokmmGVxK7zuF
8JUnLkNX2B/pgMKipqakLfRqS9JGS5MjIxxzm8ZPZkqQug0KS1GVQSJK6Vt0cRZIZ4J8v9HlIH+Ay4F8u9ElIi/gPVbXo5wtWXVBIv8ySTH3QhKH0y8pbT5KkLoNCktRlUEgr64ELfQLSUjlHIUnq8opCktRlUEgrIMlDSU4meflCn4u0VAaFtDIe5hdL/EofKQaFtAKq6vssspaK9FFgUEiSugwKSVKXQSFJ6jIoJEldBoW0ApJ8G3gO+I9JjrelgqWPBD+ZLUnq8opCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpK7/B9G71F0NbH01AAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.boxplot(df['LENGTH_OF_MEASUREMENT'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Usuwamy najdĹuĹźsze pomiary\n", + "df.drop(df.loc[df['LENGTH_OF_MEASUREMENT'] > 200000].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", 
+ " <th>count</th>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>...</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.526452</td>\n", + " <td>22.776858</td>\n", + " <td>2.170711e+06</td>\n", + " <td>5.902836e+12</td>\n", + " <td>82.639686</td>\n", + " <td>82.639686</td>\n", + " <td>18.142610</td>\n", + " <td>18.142610</td>\n", + " <td>2.603486</td>\n", + " <td>2.603486</td>\n", + " <td>...</td>\n", + " <td>29.378906</td>\n", + " <td>499539.922961</td>\n", + " <td>3.717748e+07</td>\n", + " <td>1.983745e+06</td>\n", + " <td>66756.283605</td>\n", + " <td>10103.277673</td>\n", + " <td>133222.336198</td>\n", + " <td>1983.244840</td>\n", + " <td>-72.536124</td>\n", + " <td>-404.190290</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.583369</td>\n", + " <td>12.801744</td>\n", + " <td>7.820521e+04</td>\n", + " <td>7.406653e+07</td>\n", + " <td>141.056329</td>\n", + " <td>141.056329</td>\n", + " <td>8.830015</td>\n", + " <td>8.830015</td>\n", + " <td>0.932835</td>\n", + " <td>0.932835</td>\n", + " <td>...</td>\n", + " <td>17.290876</td>\n", + " <td>288325.166786</td>\n", + " <td>1.183041e+06</td>\n", + " <td>1.183041e+06</td>\n", + " <td>44987.211264</td>\n", + " <td>3113.779828</td>\n", + " <td>39296.957914</td>\n", + " <td>1183.041729</td>\n", + " <td>5.831107</td>\n", + " <td>267.914649</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " 
<td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-98.000000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>12.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>249000.000000</td>\n", + " <td>3.617897e+07</td>\n", + " <td>9.852318e+05</td>\n", + " <td>29587.000000</td>\n", + " <td>8027.000000</td>\n", + " <td>101041.000000</td>\n", + " <td>985.000000</td>\n", + " <td>-76.400000</td>\n", + " <td>-527.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155605e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.714444e+07</td>\n", + " <td>1.950704e+06</td>\n", + " <td>59599.000000</td>\n", + " <td>9887.000000</td>\n", + " <td>138579.000000</td>\n", + " <td>1950.000000</td>\n", + " <td>-72.900000</td>\n", + " <td>-350.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>34.000000</td>\n", + " <td>2.226340e+06</td>\n", + " 
<td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>748000.000000</td>\n", + " <td>3.834786e+07</td>\n", + " <td>3.154128e+06</td>\n", + " <td>97761.750000</td>\n", + " <td>12660.000000</td>\n", + " <td>168403.000000</td>\n", + " <td>3154.000000</td>\n", + " <td>-68.400000</td>\n", + " <td>-226.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-41.000000</td>\n", + " <td>42.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.908956e+07</td>\n", + " <td>3.895821e+06</td>\n", + " <td>189705.000000</td>\n", + " <td>15444.000000</td>\n", + " <td>189705.000000</td>\n", + " <td>3895.000000</td>\n", + " <td>-41.000000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 317762.000000 317762.000000 3.177620e+05 3.177620e+05 \n", + "mean -75.526452 22.776858 2.170711e+06 5.902836e+12 \n", + "std 5.583369 12.801744 7.820521e+04 7.406653e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 12.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155605e+06 5.902806e+12 \n", + "75% -71.900000 34.000000 2.226340e+06 5.902852e+12 \n", + "max -41.000000 42.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 317762.000000 317762.000000 317762.000000 317762.000000 \n", + "mean 82.639686 82.639686 18.142610 18.142610 \n", + "std 141.056329 141.056329 
8.830015 8.830015 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 317762.000000 317762.000000 ... 317762.000000 317762.000000 \n", + "mean 2.603486 2.603486 ... 29.378906 499539.922961 \n", + "std 0.932835 0.932835 ... 17.290876 288325.166786 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 249000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 748000.000000 \n", + "max 6.000000 6.000000 ... 59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 3.177620e+05 3.177620e+05 317762.000000 317762.000000 \n", + "mean 3.717748e+07 1.983745e+06 66756.283605 10103.277673 \n", + "std 1.183041e+06 1.183041e+06 44987.211264 3113.779828 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.617897e+07 9.852318e+05 29587.000000 8027.000000 \n", + "50% 3.714444e+07 1.950704e+06 59599.000000 9887.000000 \n", + "75% 3.834786e+07 3.154128e+06 97761.750000 12660.000000 \n", + "max 3.908956e+07 3.895821e+06 189705.000000 15444.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 317762.000000 317762.000000 317762.000000 \n", + "mean 133222.336198 1983.244840 -72.536124 \n", + "std 39296.957914 1183.041729 5.831107 \n", + "min 53538.000000 0.000000 -98.000000 \n", + "25% 101041.000000 985.000000 -76.400000 \n", + "50% 138579.000000 1950.000000 -72.900000 \n", + "75% 168403.000000 3154.000000 -68.400000 \n", + "max 189705.000000 3895.000000 -41.000000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 317762.000000 \n", + "mean -404.190290 \n", + "std 267.914649 \n", + "min 
-2629.400000 \n", + "25% -527.300000 \n", + "50% -350.700000 \n", + "75% -226.100000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 7, 6, 10, 11, 12, 13, 14, 16, 18, 17, 19, 20,\n", + " 21, 23, 24, 26, 27, 28, 29, 31, 32, 34, 35, 36, 38, 37, 40, 39, 41,\n", + " 42], dtype=int64)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.MEASUREMENT.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 , RK485-99X\n", + "2 , RK485-99X\n", + "3 , RK485-99X\n", + "4 , RK485-99X\n", + "5 , RK485-99X\n", + "7 , RK485-99X\n", + "6 , RK485-99X\n", + "10 , RK485-99X\n", + "11 , RK485-99X\n", + "12 , RK485-99X\n", + "13 , RK485-99X\n", + "14 , RK485-99X\n", + "16 , RK485-99X\n", + "18 , RK485-99X\n", + "17 , RK485-99X\n", + "19 , RK485-99X\n", + "20 , RK485-99X\n", + "21 , RK485-99X\n", + "23 , RK485-99X\n", + "24 , RK485-99X\n", + "26 , RK485-99X\n", + "27 , RK485-99X\n", + "28 , RK485-99X\n", + "29 , RK485-99X\n", + "31 , RK485-99X\n", + "32 , RK485-99X\n", + "34 , RK485-99X\n", + "35 , RK485-99X\n", + "36 , RK485-99X\n", + "38 , RK485-99X\n", + "37 , RK485-99X\n", + "40 , RK485-99X\n", + "39 , RK485-99X\n", + "41 , RK485-99X\n", + "42 , RK485-99X\n" + ] + } + ], + "source": [ + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.StyleColor.unique():\n", + " zbior = zb[zb['StyleColor'] == j]\n", + " if zbior.EPC.unique().size == 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# 
Zatem usuwamy cały ten Stylokolor\n", + "df.drop(df.loc[df['StyleColor'] == 'RK485-99X'].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Sprawdzamy, czy mamy klipsy przypisane do więcej niż 1 Itemu\n", + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.EPC.unique():\n", + " zbior = zb[zb['EPC'] == j]\n", + " if zbior.EAN.unique().size > 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#do testow samego modelu\n", + "test_1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ((df.MEASUREMENT == 2) | (df.MEASUREMENT == 3) | (df.MEASUREMENT == 4) | (df.MEASUREMENT == 26) | (df.MEASUREMENT == 28) ) ]\n", + "\n", + "test1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 1) ) ]\n", + "test2 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 2) ) ]\n", + "test3 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 3) ) ]\n", + "test4 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 4) ) ]\n", + "test5 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 5) ]\n", + "test6 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 6) ]\n", + "test9 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 9) ]\n", + "test12 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 12) ]\n", + "test22 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 22) ]\n", + "test24 = 
df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 24) ]\n", + "test25 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 25) ]\n", + "test21 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 21) ]\n", + "test29 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 29) ]\n", + "test28 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 28) ]\n", + "\n", + "test29" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test6)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 3])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 4])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 26])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 28])\n", + "plt.show()" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Następnie tworzymy intuicyjny podział na zbiór testowy i treningowy: pomiary przed godziną 10.30 traktujemy jako zbiór treningowy, natomiast te po godzinie 10.30 - jako zbiór testowy." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "train = df[df.TIMESTAMP <= '2021-10-26T10:30:00.000']\n", + "train = train[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]\n", + "test = df[df.TIMESTAMP > '2021-10-26T10:30:00.000']\n", + "test = test[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.cluster import DBSCAN\n", + "from sklearn import metrics\n", + "from sklearn.datasets import make_blobs\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import normalize\n", + "from sklearn.neighbors import NearestNeighbors\n", + "import plotly.express as px\n", + "from kneed import KneeLocator" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#zbiór testowy dla kilku stylokolorów\n", + "test0 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & ((df.StyleColor == 'RH797-81X') | (df.StyleColor == 'SL171-99X') \n", + " | (df.StyleColor == 'RH797-59X'))]\n", + "caly1 = test0[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def c1(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = 
np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(2,minimal_epc.astype(int))\n", + " \n", + " if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(6000,kneedle.knee_y/8)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "#zbiĂłr testowy dla kilku stylokolorĂłw \n", + "train1 = train[(train.StyleColor == 'RH267-85J') | (train.StyleColor == 'RJ369-87X') | (train.StyleColor =='RM119-93X') \n", + " | (train.StyleColor == 'RS483-99X') | (train.StyleColor == 'SB281-90M')]\n", + "\n", + "train2 = train[(train.StyleColor == 'RV167-MLC') | (train.StyleColor == 
'RV462-87X') | (train.StyleColor =='QJ677-33X') \n", + " | (train.StyleColor == 'RH797-00X') | (train.StyleColor == 'RH267-55J')]\n", + "\n", + "train3 = train[(train.StyleColor == 'SL171-99X') | (train.StyleColor == 'SO133-09M') | (train.StyleColor =='RB254-00X') \n", + " | (train.StyleColor == 'SF078-MLC') | (train.StyleColor == 'QY337-00X')]\n", + "\n", + "train4 = train[(train.StyleColor == 'SP095-59X') | (train.StyleColor == 'RN633-00X') | (train.StyleColor =='RH267-59J') \n", + " | (train.StyleColor == 'RV167-87X')]\n", + "\n", + "train5 = train[(train.StyleColor == 'RJ365-09M') | (train.StyleColor == 'RH797-59X') | (train.StyleColor =='SP090-90X') \n", + " | (train.StyleColor == 'RH797-99X') | (train.StyleColor == 'RJ371-59M')]\n", + "\n", + "train6 = train[(train.StyleColor == 'RV462-99X') | (train.StyleColor == 'RH797-81X') | (train.StyleColor =='QZ555-20X') \n", + " | (train.StyleColor == 'RJ371-53M') | (train.StyleColor == 'RS054-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train1)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + 
"0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train3)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train4)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + 
}, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train5)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", 
+ "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 29 5902805820447 RH797-81X\n", + "3 32 5902805820447 RH797-81X\n", + "4 34 5902805820447 RH797-81X\n", + "5 38 
5902851852614 SO133-09M\n", + "6 38 5902851852638 SO133-09M\n", + "7 38 5902851852638 SO133-09M\n", + "8 42 5902851852638 SO133-09M\n", + "9 42 5902851852614 SO133-09M\n", + "10 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 25 5902805820447 RH797-81X\n", + "2 24 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 30 
5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1,0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 21 5902805820447 RH797-81X\n", + "2 25 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 26 5902805820447 
RH797-81X\n", + "5 30 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 225, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1, 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + "DF = df[(df.StyleColor == 'RH797-81X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "proba = DF[ DF.MEASUREMENT == 38 ]\n", + "X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + "minimal_epc = np.floor(proba['EAN'].value_counts().min()/2)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3035684754501F8000B5B6E6 140\n", + "3035684754501F4000B5B6E5 135\n", + "3035684754501F0000B5B614 130\n", + "3035684754501F0000B5B632 90\n", + "3035684754501F8000B5B6A5 80\n", + "Name: EPC, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proba['EPC'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.0" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "minimal_epc" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "neighbors = max(2,minimal_epc.astype(int))\n", + "X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + "nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + "distances, indices = 
nbrs.kneighbors(X_embedded)\n", + "distance_desc = sorted(distances[:,neighbors-1], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5000" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eps = max(5000,kneedle.knee_y/8)\n", + "eps" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='brute')\n", + "db.fit(X)\n", + "y_pred = db.fit_predict(X)\n", + "clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + "calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "if (db.labels_[db.labels_ == -1].size != 0) :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.6*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': 24, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", + " 'StyleColor':'QY337-00X'}, ignore_index = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3035684754501F4000B5B6E5 39\n", + "3035684754501F8000B5B6E6 31\n", + "3035684754501F8000B5B6A5 10\n", + "3035684754501F0000B5B632 10\n", + "3035684754501F0000B5B614 6\n", + "Name: EPC, dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliery" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Wybrane StyleColor\n", + "z1 = test[(test.StyleColor == 'RH267-85J') | (test.StyleColor == 'RJ369-87X') | (test.StyleColor =='RM119-93X') \n", + " | (test.StyleColor == 'RS483-99X') | (test.StyleColor == 'SB281-90M')]\n", + "\n", + "z2 = test[(test.StyleColor == 'RV167-MLC') | (test.StyleColor == 'RV462-87X') | (test.StyleColor =='QJ677-33X') \n", + " | (test.StyleColor == 'RH797-00X') | (test.StyleColor == 'RH267-55J')]\n", + "\n", + "z3 = test[(test.StyleColor == 'SL171-99X') | (test.StyleColor == 'SO133-09M') | (test.StyleColor =='RB254-00X') \n", + " | (test.StyleColor == 'SF078-MLC') | (test.StyleColor == 'QY337-00X')]\n", + "\n", + "z4 = test[(test.StyleColor == 'SP095-59X') | (test.StyleColor == 'RN633-00X') | (test.StyleColor =='RH267-59J') \n", + " | (test.StyleColor == 'RV167-87X') | (test.StyleColor == 'RK485-99X')]\n", + "\n", + "z5 = test[(test.StyleColor 
== 'RJ365-09M') | (test.StyleColor == 'RH797-59X') | (test.StyleColor =='SP090-90X') \n", + " | (test.StyleColor == 'RH797-99X') | (test.StyleColor == 'RJ371-59M')]\n", + "\n", + "z6 = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='QZ555-20X') \n", + " | (test.StyleColor == 'RJ371-53M') | (test.StyleColor == 'RS054-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z1)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + 
" <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 277, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 277, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /5\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 278, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " 
<td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 278, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 280, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 280, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /6\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style 
scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 281, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 283, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 283, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# eps min 5000 / 5\n", + "# \n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 284, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 29 5902805820447 RH797-81X\n", + "3 32 5902805820447 RH797-81X\n", + "4 34 5902805820447 RH797-81X\n", + "5 38 5902851852614 SO133-09M\n", + "6 38 5902851852638 SO133-09M\n", + "7 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 284, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c=0.6\n", + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " 
<td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 317, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " 
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 318, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "def c2(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(2,minimal_epc.astype(int))\n", + " \n", + " if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(5000,kneedle.knee_y/5)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if 
db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " 
<td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " 
<th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 42 5902690542745 QY337-00X\n", + "3 29 5902805820447 RH797-81X\n", + "4 32 5902805820447 RH797-81X\n", + "5 34 5902805820447 RH797-81X\n", + "6 38 5902851852614 SO133-09M\n", + "7 38 5902851852638 SO133-09M\n", + "8 38 5902851852638 SO133-09M\n", + "9 42 5902851852638 SO133-09M\n", + "10 42 5902851852614 SO133-09M\n", + "11 42 5902851852638 
SO133-09M" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def c3(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(2,minimal_epc.astype(int))\n", + " \n", + " if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(5000,kneedle.knee_y/4)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='kd_tree')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == 
b].EAN.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c3(train)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 42 5902690542745 QY337-00X\n", + "3 29 5902805820447 RH797-81X\n", + "4 
def _c4_cluster(sample, algorithm='auto', max_neighbors=70, min_eps=5000):
    """Run DBSCAN on the readings of one (StyleColor, MEASUREMENT) group.

    Parameters
    ----------
    sample : pd.DataFrame
        Readings of a single measurement; must provide the columns
        'EPC', 'TIME_MS' and 'SUM_PROXIMITY_KMS'.
    algorithm : str
        Neighbour-search algorithm forwarded to DBSCAN.
    max_neighbors : int
        Upper bound for both the k of the k-distance curve and min_samples.
    min_eps : float
        Lower bound for the DBSCAN radius.

    Returns
    -------
    tuple or None
        (labels, half_reads): the DBSCAN label of every row of `sample`
        (-1 marks noise) and half of the smallest per-EPC reading count,
        capped at `max_neighbors`. None when the group holds fewer readings
        than the neighbour count, so no k-distance curve can be built.
    """
    # Half of the reading count of the least-read EPC drives both parameters.
    half_reads = min(int(sample['EPC'].value_counts().min() // 2), max_neighbors)
    k = max(2, half_reads)
    if len(sample) < k:
        # kneighbors() rejects n_neighbors > n_samples; nothing to cluster.
        return None

    # One matrix serves both estimators: Euclidean distance does not depend
    # on the column order.
    points = sample[['TIME_MS', 'SUM_PROXIMITY_KMS']].to_numpy()

    # k-distance curve, sorted descending, for the knee heuristic.
    distances, _ = NearestNeighbors(n_neighbors=k).fit(points).kneighbors(points)
    k_distances = sorted(distances[:, k - 1], reverse=True)
    knee = KneeLocator(range(1, len(k_distances) + 1),  # x values
                       k_distances,                     # y values
                       S=1.0,
                       curve="convex",
                       direction="decreasing").knee_y

    # KneeLocator reports None when it finds no knee; fall back to the floor.
    eps = min_eps if knee is None else max(min_eps, knee / 2)

    # scikit-learn expects a positive integer for min_samples; the half-count
    # can be 0 when some EPC was read only once.
    model = DBSCAN(eps=eps, min_samples=max(1, half_reads), algorithm=algorithm)
    return model.fit_predict(points), half_reads


def c4(x):
    """Flag, per measurement, the item whose readings dominate the DBSCAN noise.

    For every StyleColor and every MEASUREMENT within it, the readings are
    clustered on (TIME_MS, SUM_PROXIMITY_KMS). When the number of noise points
    exceeds half of the smallest per-EPC reading count and the EPC with the
    most noise points accounts for more than half of all noise points, the
    EAN of that EPC is reported.

    Parameters
    ----------
    x : pd.DataFrame
        Readings with the columns 'StyleColor', 'MEASUREMENT', 'EPC', 'EAN',
        'TIME_MS' and 'SUM_PROXIMITY_KMS'.

    Returns
    -------
    pd.DataFrame
        One row per flagged measurement with the columns
        'MEASUREMENT', 'EAN' and 'StyleColor'; empty when nothing is flagged.
        Measurements with too few readings to cluster are skipped.
    """
    flagged = []
    for style in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style]

        for measurement in style_rows.MEASUREMENT.unique():
            sample = style_rows[style_rows.MEASUREMENT == measurement]
            clustered = _c4_cluster(sample, algorithm='auto')
            if clustered is None:
                continue
            labels, half_reads = clustered

            noise_count = int((labels == -1).sum())
            if noise_count == 0 or noise_count <= half_reads:
                continue

            # value_counts() sorts descending, so position 0 is the EPC with
            # the most noise points.
            noise_per_epc = sample.loc[labels == -1, 'EPC'].value_counts()
            top_epc = noise_per_epc.index[0]
            top_count = noise_per_epc.iloc[0]

            # Report only when a single EPC owns more than half of the noise.
            if noise_per_epc.sum() - 2 * top_count < 0:
                flagged.append({
                    'MEASUREMENT': measurement,
                    'EAN': sample.loc[sample['EPC'] == top_epc, 'EAN'].iloc[0],
                    'StyleColor': style,
                })

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(flagged, columns=['MEASUREMENT', 'EAN', 'StyleColor'])
" <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>6</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>12</td>\n", + " <td>5902851535913</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>2</td>\n", + " <td>5902975217986</td>\n", + " <td>RV462-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>4</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>7</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>12</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>4</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>7</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>20</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>2</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>4</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>13</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>16</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>23</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>24</td>\n", + " <td>5902805219685</td>\n", + " 
<td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>1</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>2</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>13</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>21</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>24</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>26</td>\n", + " <td>5902805444698</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>23</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>3</td>\n", + " <td>5902805385823</td>\n", + " <td>RJ371-53M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 6 5902805533040 RH267-85J\n", + "1 24 5902805533040 RH267-85J\n", + "2 4 5902851445731 RS483-99X\n", + "3 7 5902851445700 RS483-99X\n", + "4 11 5902851445731 RS483-99X\n", + "5 28 5902805303681 RJ369-87X\n", + "6 1 5902805431803 RM119-93X\n", + "7 5 5902805431797 RM119-93X\n", + "8 6 5902805431803 RM119-93X\n", + "9 12 5902851535913 RV167-MLC\n", + "10 2 5902975217986 RV462-87X\n", + "11 4 5902851414508 SL171-99X\n", + "12 7 5902851414508 SL171-99X\n", + "13 12 5902851414508 SL171-99X\n", + "14 4 5902851852638 SO133-09M\n", + "15 7 5902851852638 SO133-09M\n", + "16 14 5902851852638 SO133-09M\n", + "17 
20 5902851852638 SO133-09M\n", + "18 2 5902690542769 QY337-00X\n", + "19 4 5902690542745 QY337-00X\n", + "20 13 5902690542769 QY337-00X\n", + "21 16 5902690542745 QY337-00X\n", + "22 23 5902690542745 QY337-00X\n", + "23 24 5902805219685 RN633-00X\n", + "24 1 5902805533255 RH267-59J\n", + "25 2 5902805533255 RH267-59J\n", + "26 13 5902805533255 RH267-59J\n", + "27 21 5902805533255 RH267-59J\n", + "28 24 5902805533255 RH267-59J\n", + "29 26 5902805444698 RJ365-09M\n", + "30 21 5902805820447 RH797-81X\n", + "31 23 5902805820447 RH797-81X\n", + "32 28 5902805820447 RH797-81X\n", + "33 3 5902805385823 RJ371-53M" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c4(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>41</td>\n", + " <td>5902805532999</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805431803</td>\n", + " 
<td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>40</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>32</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>37</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>40</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>32</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>36</td>\n", + " <td>5902805820423</td>\n", + " <td>RH797-59X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>37</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>32</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>40</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>39</td>\n", + " <td>5902851852638</td>\n", + " 
def _c5_cluster(sample, reads_per_epc, algorithm='kd_tree', max_neighbors=70,
                min_eps=5000):
    """Run DBSCAN on the readings of one (StyleColor, MEASUREMENT) group.

    Parameters
    ----------
    sample : pd.DataFrame
        Readings of a single measurement; must provide the columns
        'TIME_MS' and 'SUM_PROXIMITY_KMS'.
    reads_per_epc : pd.Series
        Number of readings of every EPC in `sample`.
    algorithm : str
        Neighbour-search algorithm forwarded to DBSCAN.
    max_neighbors : int
        Upper bound for both the k of the k-distance curve and min_samples.
    min_eps : float
        Lower bound for the DBSCAN radius.

    Returns
    -------
    numpy.ndarray or None
        The DBSCAN label of every row of `sample` (-1 marks noise), or None
        when the group holds fewer readings than the neighbour count, so no
        k-distance curve can be built.
    """
    # Half of the reading count of the least-read EPC drives both parameters.
    half_reads = min(int(reads_per_epc.min() // 2), max_neighbors)
    k = max(2, half_reads)
    if len(sample) < k:
        # kneighbors() rejects n_neighbors > n_samples; nothing to cluster.
        return None

    # One matrix serves both estimators: Euclidean distance does not depend
    # on the column order.
    points = sample[['TIME_MS', 'SUM_PROXIMITY_KMS']].to_numpy()

    # k-distance curve, sorted descending, for the knee heuristic.
    distances, _ = NearestNeighbors(n_neighbors=k).fit(points).kneighbors(points)
    k_distances = sorted(distances[:, k - 1], reverse=True)
    knee = KneeLocator(range(1, len(k_distances) + 1),  # x values
                       k_distances,                     # y values
                       S=1.0,
                       curve="convex",
                       direction="decreasing").knee_y

    # KneeLocator reports None when it finds no knee; fall back to the floor.
    eps = min_eps if knee is None else max(min_eps, knee / 2)

    # scikit-learn expects a positive integer for min_samples; the half-count
    # can be 0 when some EPC was read only once.
    model = DBSCAN(eps=eps, min_samples=max(1, half_reads), algorithm=algorithm)
    return model.fit_predict(points)


def c5(x):
    """Flag every item whose readings are mostly classified as DBSCAN noise.

    For every StyleColor and every MEASUREMENT within it, the readings are
    clustered on (TIME_MS, SUM_PROXIMITY_KMS). Each EPC with more than half
    of its readings in that measurement labelled as noise is reported by
    its EAN.

    Parameters
    ----------
    x : pd.DataFrame
        Readings with the columns 'StyleColor', 'MEASUREMENT', 'EPC', 'EAN',
        'TIME_MS' and 'SUM_PROXIMITY_KMS'.

    Returns
    -------
    pd.DataFrame
        One row per flagged (measurement, EPC) with the columns
        'MEASUREMENT', 'EAN' and 'StyleColor'; empty when nothing is flagged.
        Measurements with too few readings to cluster are skipped.
    """
    flagged = []
    for style in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style]

        for measurement in style_rows.MEASUREMENT.unique():
            sample = style_rows[style_rows.MEASUREMENT == measurement]
            # Computed once per measurement and reused as the denominator
            # of the noise share below.
            reads_per_epc = sample['EPC'].value_counts()

            labels = _c5_cluster(sample, reads_per_epc, algorithm='kd_tree')
            if labels is None:
                continue

            noise_per_epc = sample.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_reads in noise_per_epc.items():
                # Report the EPC when noise is the majority of its readings.
                if noise_reads > 0.5 * reads_per_epc[epc]:
                    flagged.append({
                        'MEASUREMENT': measurement,
                        'EAN': sample.loc[sample['EPC'] == epc, 'EAN'].iloc[0],
                        'StyleColor': style,
                    })

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(flagged, columns=['MEASUREMENT', 'EAN', 'StyleColor'])
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 
+} diff --git a/df.csv b/df.csv new file mode 100644 index 0000000000000000000000000000000000000000..5232a92fa78cdb90a1c586efb0e29f7f80f75f17 Binary files /dev/null and b/df.csv differ