diff --git a/.ipynb_checkpoints/DBSCAN_1-zmiany_df-Copy1-checkpoint.ipynb b/.ipynb_checkpoints/DBSCAN_1-zmiany_df-Copy1-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c73e7893e86dd37c1a2996f67821d22991bfcc2a --- /dev/null +++ b/.ipynb_checkpoints/DBSCAN_1-zmiany_df-Copy1-checkpoint.ipynb @@ -0,0 +1,7725 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pyodbc\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import urllib\n", + "import seaborn as sns\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "params = urllib.parse.quote_plus(\"DRIVER={ODBC Driver 17 for SQL Server};\"\n", + " #\"SERVER=dbserver.mif.pg.gda.pl,1433;\"\n", + " \"SERVER=127.0.0.1,1433;\"\n", + " \"DATABASE=silkycoders;\"\n", + " \"UID=;\"\n", + " \"PWD=\")\n", + "\n", + "engine = create_engine(\"mssql+pyodbc:///?odbc_connect={}\".format(params))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"SELECT rfid.*, it.*, sub.*, cl.*, dep.*, br.*\n", + " FROM rfid.Logs rfid \n", + " JOIN rfid.EanEpc ean \n", + " ON rfid.EPC = ean.EPC \n", + " JOIN dw.Item it \n", + " ON ean.EAN = it.EAN \n", + " JOIN dw.Subclass sub \n", + " ON sub.SubclassID = it.SubclassID\n", + " JOIN dw.Class cl\n", + " ON sub.ClassID = cl.ClassID\n", + " JOIN dw.Department dep\n", + " ON dep.DepartmentID = cl.DepartmentID\n", + " JOIN dw.Brand br\n", + " ON dep.BrandID = br.BrandID\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql_query(query, engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
prepare_timestamp(df : pd.DataFrame):\n", + " \n", + " dt = df.sort_values(by=\"TIMESTAMP\").reset_index(drop=True)\n", + " dt[\"HOUR\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.hour.astype(int)\n", + " dt[\"MIN\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.minute.astype(int)\n", + " dt[\"SEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.second.astype(int)\n", + " dt[\"MICROSEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.microsecond.astype(int)\n", + "\n", + " dt[\"MILISEC\"] = dt.MICROSEC/1000 + dt.SEC*1000 + dt.MIN*60000 + dt.HOUR*3600000\n", + " dt[\"TIME_MS\"] = dt.MILISEC - dt.MILISEC[0]\n", + " \n", + " dt['TIME_PER_MEASUREMENT_MS'] = 0\n", + " dt['NUMBER_OF_SIGNALS'] = 0\n", + " dt['LENGTH_OF_MEASUREMENT'] = 0\n", + " \n", + " for m in dt.MEASUREMENT.unique():\n", + " filtr = (dt.MEASUREMENT == m)\n", + " dt.loc[filtr,'TIME_PER_MEASUREMENT_MS'] = dt[filtr].MILISEC - dt[filtr].MILISEC.iloc[0]\n", + " dt.loc[filtr, \"NUMBER_OF_SIGNALS\"] = len(dt[filtr])\n", + " dt.loc[filtr, 'LENGTH_OF_MEASUREMENT'] = dt[filtr].TIME_PER_MEASUREMENT_MS.max()\n", + " \n", + " dt[\"TIME_KMS\"] = np.floor(dt.TIME_MS/1000) \n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].max().reset_index(name=\"MAX_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].sum().reset_index(name=\"SUM_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " return dt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = prepare_timestamp(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('df.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 502689 entries, 0 to 
502688\n", + "Data columns (total 36 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 EPC 502689 non-null object \n", + " 1 PROXIMITY 502689 non-null float64\n", + " 2 TIMESTAMP 502689 non-null object \n", + " 3 MEASUREMENT 502689 non-null int64 \n", + " 4 ITEMID 502689 non-null int64 \n", + " 5 EAN 502689 non-null int64 \n", + " 6 StyleColorSize 502689 non-null object \n", + " 7 StyleColor 502689 non-null object \n", + " 8 Size 502689 non-null object \n", + " 9 SubclassID 502689 non-null int64 \n", + " 10 ItemSeason 502689 non-null object \n", + " 11 FashionLevel 369997 non-null object \n", + " 12 SubclassID.1 502689 non-null int64 \n", + " 13 SubclassName 502689 non-null object \n", + " 14 ClassID 502689 non-null int64 \n", + " 15 ClassID.1 502689 non-null int64 \n", + " 16 ClassName 502689 non-null object \n", + " 17 DepartmentID 502689 non-null int64 \n", + " 18 DepartmentID.1 502689 non-null int64 \n", + " 19 DepartmentName 502689 non-null object \n", + " 20 BrandID 502689 non-null int64 \n", + " 21 BrandID.1 502689 non-null int64 \n", + " 22 BrandName 502689 non-null object \n", + " 23 Active 502689 non-null bool \n", + " 24 HOUR 502689 non-null int64 \n", + " 25 MIN 502689 non-null int64 \n", + " 26 SEC 502689 non-null int64 \n", + " 27 MICROSEC 502689 non-null int64 \n", + " 28 MILISEC 502689 non-null float64\n", + " 29 TIME_MS 502689 non-null float64\n", + " 30 TIME_PER_MEASUREMENT_MS 502689 non-null float64\n", + " 31 NUMBER_OF_SIGNALS 502689 non-null int64 \n", + " 32 LENGTH_OF_MEASUREMENT 502689 non-null int64 \n", + " 33 TIME_KMS 502689 non-null float64\n", + " 34 MAX_PROXIMITY_KMS 502689 non-null float64\n", + " 35 SUM_PROXIMITY_KMS 502689 non-null float64\n", + "dtypes: bool(1), float64(7), int64(17), object(11)\n", + "memory usage: 134.7+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>...</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.00000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.406017</td>\n", + " <td>23.412382</td>\n", + " <td>2.169862e+06</td>\n", + " <td>5.902835e+12</td>\n", + " <td>83.920704</td>\n", + " <td>83.920704</td>\n", + " <td>18.231477</td>\n", 
+ " <td>18.231477</td>\n", + " <td>2.609574</td>\n", + " <td>2.609574</td>\n", + " <td>...</td>\n", + " <td>29.193547</td>\n", + " <td>499773.110213</td>\n", + " <td>3.721192e+07</td>\n", + " <td>2.018186e+06</td>\n", + " <td>100132.210719</td>\n", + " <td>15383.906986</td>\n", + " <td>199835.398777</td>\n", + " <td>2017.68607</td>\n", + " <td>-72.497318</td>\n", + " <td>-398.108291</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.698062</td>\n", + " <td>12.175284</td>\n", + " <td>7.798483e+04</td>\n", + " <td>7.380986e+07</td>\n", + " <td>142.489244</td>\n", + " <td>142.489244</td>\n", + " <td>8.844056</td>\n", + " <td>8.844056</td>\n", + " <td>0.937828</td>\n", + " <td>0.937828</td>\n", + " <td>...</td>\n", + " <td>17.223297</td>\n", + " <td>288469.414710</td>\n", + " <td>1.121487e+06</td>\n", + " <td>1.121487e+06</td>\n", + " <td>81859.831696</td>\n", + " <td>8217.121271</td>\n", + " <td>101049.072703</td>\n", + " <td>1121.48684</td>\n", + " <td>5.893956</td>\n", + " <td>262.167663</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.00000</td>\n", + " <td>-100.500000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>13.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " 
<td>14.000000</td>\n", + " <td>250000.000000</td>\n", + " <td>3.624898e+07</td>\n", + " <td>1.055248e+06</td>\n", + " <td>38108.000000</td>\n", + " <td>8533.000000</td>\n", + " <td>127122.000000</td>\n", + " <td>1055.00000</td>\n", + " <td>-76.400000</td>\n", + " <td>-515.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155604e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.719645e+07</td>\n", + " <td>2.002711e+06</td>\n", + " <td>78477.000000</td>\n", + " <td>13321.000000</td>\n", + " <td>176026.000000</td>\n", + " <td>2002.00000</td>\n", + " <td>-72.900000</td>\n", + " <td>-342.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>33.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>749000.000000</td>\n", + " <td>3.815973e+07</td>\n", + " <td>2.965991e+06</td>\n", + " <td>139431.000000</td>\n", + " <td>22217.000000</td>\n", + " <td>265127.000000</td>\n", + " <td>2965.00000</td>\n", + " <td>-68.400000</td>\n", + " <td>-225.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-38.900000</td>\n", + " <td>43.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.912875e+07</td>\n", 
+ " <td>3.935013e+06</td>\n", + " <td>435771.000000</td>\n", + " <td>35350.000000</td>\n", + " <td>435771.000000</td>\n", + " <td>3935.00000</td>\n", + " <td>-38.900000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 502689.000000 502689.000000 5.026890e+05 5.026890e+05 \n", + "mean -75.406017 23.412382 2.169862e+06 5.902835e+12 \n", + "std 5.698062 12.175284 7.798483e+04 7.380986e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 13.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155604e+06 5.902806e+12 \n", + "75% -71.900000 33.000000 2.226340e+06 5.902852e+12 \n", + "max -38.900000 43.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 502689.000000 502689.000000 502689.000000 502689.000000 \n", + "mean 83.920704 83.920704 18.231477 18.231477 \n", + "std 142.489244 142.489244 8.844056 8.844056 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 502689.000000 502689.000000 ... 502689.000000 502689.000000 \n", + "mean 2.609574 2.609574 ... 29.193547 499773.110213 \n", + "std 0.937828 0.937828 ... 17.223297 288469.414710 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 250000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 749000.000000 \n", + "max 6.000000 6.000000 ... 
59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 5.026890e+05 5.026890e+05 502689.000000 502689.000000 \n", + "mean 3.721192e+07 2.018186e+06 100132.210719 15383.906986 \n", + "std 1.121487e+06 1.121487e+06 81859.831696 8217.121271 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.624898e+07 1.055248e+06 38108.000000 8533.000000 \n", + "50% 3.719645e+07 2.002711e+06 78477.000000 13321.000000 \n", + "75% 3.815973e+07 2.965991e+06 139431.000000 22217.000000 \n", + "max 3.912875e+07 3.935013e+06 435771.000000 35350.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 502689.000000 502689.00000 502689.000000 \n", + "mean 199835.398777 2017.68607 -72.497318 \n", + "std 101049.072703 1121.48684 5.893956 \n", + "min 53538.000000 0.00000 -100.500000 \n", + "25% 127122.000000 1055.00000 -76.400000 \n", + "50% 176026.000000 2002.00000 -72.900000 \n", + "75% 265127.000000 2965.00000 -68.400000 \n", + "max 435771.000000 3935.00000 -38.900000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 502689.000000 \n", + "mean -398.108291 \n", + "std 262.167663 \n", + "min -2629.400000 \n", + "25% -515.900000 \n", + "50% -342.900000 \n", + "75% -225.700000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.boxplot(df['LENGTH_OF_MEASUREMENT']);" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Usuwamy najdĹuĹźsze pomiary\n", + 
"df.drop(df.loc[df['LENGTH_OF_MEASUREMENT'] > 200000].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>...</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " 
<td>-75.526452</td>\n", + " <td>22.776858</td>\n", + " <td>2.170711e+06</td>\n", + " <td>5.902836e+12</td>\n", + " <td>82.639686</td>\n", + " <td>82.639686</td>\n", + " <td>18.142610</td>\n", + " <td>18.142610</td>\n", + " <td>2.603486</td>\n", + " <td>2.603486</td>\n", + " <td>...</td>\n", + " <td>29.378906</td>\n", + " <td>499539.922961</td>\n", + " <td>3.717748e+07</td>\n", + " <td>1.983745e+06</td>\n", + " <td>66756.283605</td>\n", + " <td>10103.277673</td>\n", + " <td>133222.336198</td>\n", + " <td>1983.244840</td>\n", + " <td>-72.536124</td>\n", + " <td>-404.190290</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.583369</td>\n", + " <td>12.801744</td>\n", + " <td>7.820521e+04</td>\n", + " <td>7.406653e+07</td>\n", + " <td>141.056329</td>\n", + " <td>141.056329</td>\n", + " <td>8.830015</td>\n", + " <td>8.830015</td>\n", + " <td>0.932835</td>\n", + " <td>0.932835</td>\n", + " <td>...</td>\n", + " <td>17.290876</td>\n", + " <td>288325.166786</td>\n", + " <td>1.183041e+06</td>\n", + " <td>1.183041e+06</td>\n", + " <td>44987.211264</td>\n", + " <td>3113.779828</td>\n", + " <td>39296.957914</td>\n", + " <td>1183.041729</td>\n", + " <td>5.831107</td>\n", + " <td>267.914649</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-98.000000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>12.000000</td>\n", + " <td>2.113407e+06</td>\n", + " 
<td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>249000.000000</td>\n", + " <td>3.617897e+07</td>\n", + " <td>9.852318e+05</td>\n", + " <td>29587.000000</td>\n", + " <td>8027.000000</td>\n", + " <td>101041.000000</td>\n", + " <td>985.000000</td>\n", + " <td>-76.400000</td>\n", + " <td>-527.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155605e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.714444e+07</td>\n", + " <td>1.950704e+06</td>\n", + " <td>59599.000000</td>\n", + " <td>9887.000000</td>\n", + " <td>138579.000000</td>\n", + " <td>1950.000000</td>\n", + " <td>-72.900000</td>\n", + " <td>-350.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>34.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>748000.000000</td>\n", + " <td>3.834786e+07</td>\n", + " <td>3.154128e+06</td>\n", + " <td>97761.750000</td>\n", + " <td>12660.000000</td>\n", + " <td>168403.000000</td>\n", + " <td>3154.000000</td>\n", + " <td>-68.400000</td>\n", + " <td>-226.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-41.000000</td>\n", + " <td>42.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " 
<td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.908956e+07</td>\n", + " <td>3.895821e+06</td>\n", + " <td>189705.000000</td>\n", + " <td>15444.000000</td>\n", + " <td>189705.000000</td>\n", + " <td>3895.000000</td>\n", + " <td>-41.000000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 317762.000000 317762.000000 3.177620e+05 3.177620e+05 \n", + "mean -75.526452 22.776858 2.170711e+06 5.902836e+12 \n", + "std 5.583369 12.801744 7.820521e+04 7.406653e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 12.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155605e+06 5.902806e+12 \n", + "75% -71.900000 34.000000 2.226340e+06 5.902852e+12 \n", + "max -41.000000 42.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 317762.000000 317762.000000 317762.000000 317762.000000 \n", + "mean 82.639686 82.639686 18.142610 18.142610 \n", + "std 141.056329 141.056329 8.830015 8.830015 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 317762.000000 317762.000000 ... 317762.000000 317762.000000 \n", + "mean 2.603486 2.603486 ... 29.378906 499539.922961 \n", + "std 0.932835 0.932835 ... 17.290876 288325.166786 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 249000.000000 \n", + "50% 2.000000 2.000000 ... 
29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 748000.000000 \n", + "max 6.000000 6.000000 ... 59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 3.177620e+05 3.177620e+05 317762.000000 317762.000000 \n", + "mean 3.717748e+07 1.983745e+06 66756.283605 10103.277673 \n", + "std 1.183041e+06 1.183041e+06 44987.211264 3113.779828 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.617897e+07 9.852318e+05 29587.000000 8027.000000 \n", + "50% 3.714444e+07 1.950704e+06 59599.000000 9887.000000 \n", + "75% 3.834786e+07 3.154128e+06 97761.750000 12660.000000 \n", + "max 3.908956e+07 3.895821e+06 189705.000000 15444.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 317762.000000 317762.000000 317762.000000 \n", + "mean 133222.336198 1983.244840 -72.536124 \n", + "std 39296.957914 1183.041729 5.831107 \n", + "min 53538.000000 0.000000 -98.000000 \n", + "25% 101041.000000 985.000000 -76.400000 \n", + "50% 138579.000000 1950.000000 -72.900000 \n", + "75% 168403.000000 3154.000000 -68.400000 \n", + "max 189705.000000 3895.000000 -41.000000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 317762.000000 \n", + "mean -404.190290 \n", + "std 267.914649 \n", + "min -2629.400000 \n", + "25% -527.300000 \n", + "50% -350.700000 \n", + "75% -226.100000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 7, 6, 10, 11, 12, 13, 14, 16, 18, 17, 19, 20,\n", + " 21, 23, 24, 26, 27, 28, 29, 31, 32, 34, 35, 36, 38, 37, 40, 39, 41,\n", + " 42], dtype=int64)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"df.MEASUREMENT.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 , RK485-99X\n", + "2 , RK485-99X\n", + "3 , RK485-99X\n", + "4 , RK485-99X\n", + "5 , RK485-99X\n", + "7 , RK485-99X\n", + "6 , RK485-99X\n", + "10 , RK485-99X\n", + "11 , RK485-99X\n", + "12 , RK485-99X\n", + "13 , RK485-99X\n", + "14 , RK485-99X\n", + "16 , RK485-99X\n", + "18 , RK485-99X\n", + "17 , RK485-99X\n", + "19 , RK485-99X\n", + "20 , RK485-99X\n", + "21 , RK485-99X\n", + "23 , RK485-99X\n", + "24 , RK485-99X\n", + "26 , RK485-99X\n", + "27 , RK485-99X\n", + "28 , RK485-99X\n", + "29 , RK485-99X\n", + "31 , RK485-99X\n", + "32 , RK485-99X\n", + "34 , RK485-99X\n", + "35 , RK485-99X\n", + "36 , RK485-99X\n", + "38 , RK485-99X\n", + "37 , RK485-99X\n", + "40 , RK485-99X\n", + "39 , RK485-99X\n", + "41 , RK485-99X\n", + "42 , RK485-99X\n" + ] + } + ], + "source": [ + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.StyleColor.unique():\n", + " zbior = zb[zb['StyleColor'] == j]\n", + " if zbior.EPC.unique().size == 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Zatem usuwamy caĹy ten Stylokolor\n", + "df.drop(df.loc[df['StyleColor'] == 'RK485-99X'].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Sprawdzamy, czy mamy klipsy przypisane do wiÄcej niĹź 1 Itemu\n", + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.EPC.unique():\n", + " zbior = zb[zb['EPC'] == j]\n", + " if zbior.EAN.unique().size > 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr 
th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>EPC</th>\n", + " <th>PROXIMITY</th>\n", + " <th>TIMESTAMP</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>StyleColorSize</th>\n", + " <th>StyleColor</th>\n", + " <th>Size</th>\n", + " <th>SubclassID</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "<p>0 rows Ă 36 columns</p>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [EPC, PROXIMITY, TIMESTAMP, MEASUREMENT, ITEMID, EAN, StyleColorSize, StyleColor, Size, SubclassID, ItemSeason, FashionLevel, SubclassID.1, SubclassName, ClassID, ClassID.1, ClassName, DepartmentID, DepartmentID.1, DepartmentName, BrandID, BrandID.1, BrandName, Active, HOUR, MIN, SEC, MICROSEC, MILISEC, TIME_MS, TIME_PER_MEASUREMENT_MS, NUMBER_OF_SIGNALS, LENGTH_OF_MEASUREMENT, TIME_KMS, MAX_PROXIMITY_KMS, SUM_PROXIMITY_KMS]\n", + "Index: []\n", + "\n", + "[0 rows x 36 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#do testow samego modelu\n", + "test_1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ((df.MEASUREMENT == 2) | (df.MEASUREMENT == 3) | (df.MEASUREMENT == 4) | (df.MEASUREMENT == 26) | (df.MEASUREMENT == 28) ) ]\n", + "\n", + "test1 = 
df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 1) ) ]\n", + "test2 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 2) ) ]\n", + "test3 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 3) ) ]\n", + "test4 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 4) ) ]\n", + "test5 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 5) ]\n", + "test6 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 6) ]\n", + "test9 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 9) ]\n", + "test12 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 12) ]\n", + "test22 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 22) ]\n", + "test24 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 24) ]\n", + "test25 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 25) ]\n", + "test21 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 21) ]\n", + "test29 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 29) ]\n", + "test28 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 28) ]\n", + "\n", + "test29" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": 
"display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test6)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 3])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 4])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 26])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT 
def c1(x, g, d, c):
    """Flag EANs whose tag readings DBSCAN mostly labels as noise.

    Readings are processed per StyleColor and, within it, per MEASUREMENT.
    Each group is clustered with DBSCAN on the
    (``SUM_PROXIMITY_KMS``, ``TIME_MS``) points; an EPC is reported when more
    than the fraction ``c`` of its readings in that group are labelled noise
    (cluster ``-1``).

    Parameters
    ----------
    x : pandas.DataFrame
        Readings with the columns ``SUM_PROXIMITY_KMS``, ``TIME_MS``, ``EAN``,
        ``MEASUREMENT``, ``StyleColor`` and ``EPC``.
    g : int
        Upper bound for DBSCAN ``min_samples`` and for the ``k`` used to build
        the k-distance curve.
    d : float
        Divisor applied to the knee of the k-distance curve to obtain ``eps``
        (``eps`` never goes below 6000).
    c : float
        Fraction of an EPC's readings that must be noise for it to be flagged.

    Returns
    -------
    pandas.DataFrame
        One row per flagged EPC with the columns ``MEASUREMENT``, ``EAN`` and
        ``StyleColor``; empty when nothing is flagged.
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every call.
    flagged = []

    # sort=False keeps groups in order of first appearance, like unique().
    for style_color, style_frame in x.groupby('StyleColor', sort=False):
        for measurement, sample in style_frame.groupby('MEASUREMENT', sort=False):
            readings_per_epc = sample['EPC'].value_counts()
            if len(sample) < 2 or readings_per_epc.empty:
                # A lone reading can never be labelled noise, and the
                # k-distance curve needs at least two points (k starts at 2).
                continue

            half_rarest = int(readings_per_epc.min() // 2)
            if half_rarest > g:
                min_samples = neighbors = int(g)
            else:
                min_samples = half_rarest
                neighbors = max(2, half_rarest)
            # scikit-learn requires an integer min_samples >= 1. With 0 or 1
            # every point is a core point, so clamping does not change labels.
            min_samples = max(1, min_samples)

            points = sample[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            # k-distance curve: distance of every point to its k-th nearest
            # neighbour, sorted in decreasing order.
            distances, _ = (
                NearestNeighbors(n_neighbors=neighbors).fit(points).kneighbors(points)
            )
            k_distances = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(
                range(1, len(k_distances) + 1),  # x values
                k_distances,                     # y values
                S=1.0,                           # parameter suggested from paper
                curve="convex",
                direction="decreasing",
            )
            # knee_y is None when no knee is detected; use the eps floor then.
            knee = kneedle.knee_y
            eps = 6000 if knee is None else max(6000, knee / d)

            # A single fit_predict is enough (the model was previously fit twice).
            labels = DBSCAN(
                eps=eps, min_samples=min_samples, algorithm='auto'
            ).fit_predict(points)

            noise_per_epc = sample.loc[labels == -1, 'EPC'].value_counts()
            if noise_per_epc.empty:
                continue

            # EAN of the first reading of every EPC in this group.
            first_ean = sample.drop_duplicates('EPC').set_index('EPC')['EAN']
            for epc, noise_count in noise_per_epc.items():
                if noise_count > c * readings_per_epc[epc]:
                    flagged.append({
                        'MEASUREMENT': measurement,
                        'EAN': first_ean[epc],
                        'StyleColor': style_color,
                    })

    return pd.DataFrame(flagged, columns=columns)
# Selected StyleColor values: the test split cut into groups z1..z6.
z1 = test[test.StyleColor.isin(
    ['RH267-85J', 'RJ369-87X', 'RM119-93X', 'RS483-99X', 'SB281-90M'])]

z2 = test[test.StyleColor.isin(
    ['RV167-MLC', 'RV462-87X', 'QJ677-33X', 'RH797-00X', 'RH267-55J'])]

z3 = test[test.StyleColor.isin(
    ['SL171-99X', 'SO133-09M', 'RB254-00X', 'SF078-MLC', 'QY337-00X'])]

# NOTE(review): this group lists 'RK485-99X' as a fifth value - confirm it is
# intended to differ from the corresponding train group.
z4 = test[test.StyleColor.isin(
    ['SP095-59X', 'RN633-00X', 'RH267-59J', 'RV167-87X', 'RK485-99X'])]

z5 = test[test.StyleColor.isin(
    ['RJ365-09M', 'RH797-59X', 'SP090-90X', 'RH797-99X', 'RJ371-59M'])]

z6 = test[test.StyleColor.isin(
    ['RV462-99X', 'RH797-81X', 'QZ555-20X', 'RJ371-53M', 'RS054-99X'])]

# The same three StyleColor values taken from the train (u) and test (t) splits.
u = train[train.StyleColor.isin(['RV462-99X', 'RH797-81X', 'SL171-99X'])]
t = test[test.StyleColor.isin(['RV462-99X', 'RH797-81X', 'SL171-99X'])]
StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(u)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr 
th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train2)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "c1(train4)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train5)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": 
[ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 21 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train,70,6,0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " 
</tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 42 5902805533040 RH267-85J\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851445700 RS483-99X\n", + "3 35 5902690542769 QY337-00X\n", + "4 42 5902690542769 QY337-00X\n", + "5 42 5902690542745 QY337-00X\n", + "6 42 5902690542769 QY337-00X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 38 5902975236994 SF078-MLC\n", + "12 42 5902975236956 SF078-MLC\n", + "13 38 5902851852614 SO133-09M\n", + "14 38 5902851852638 SO133-09M\n", + "15 38 5902851852638 SO133-09M\n", + "16 42 5902851852638 SO133-09M\n", + "17 42 5902851852614 SO133-09M\n", + "18 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test,70,6,0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 25 5902805820447 RH797-81X\n", + "2 24 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 30 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1,0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + 
# Manual walk-through of the outlier check for one StyleColor / MEASUREMENT.
# The group under inspection is named once so that the rows recorded at the
# end always describe the data that was actually clustered (the labels were
# previously hard-coded to a different measurement and StyleColor).
style_color = 'SL171-99X'
measurement = 28

outliery = pd.DataFrame(columns=['MEASUREMENT', 'EAN', 'StyleColor'])
DF = df[df.StyleColor == style_color]

proba = DF[DF.MEASUREMENT == measurement]
X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)
# Half of the reading count of the rarest EPC in this group.
minimal_epc = np.floor(proba['EPC'].value_counts().min() / 2)

proba['EPC'].value_counts()  # displayed as cell output

minimal_epc  # displayed as cell output

eps = np.floor(proba['EPC'].value_counts().max() * 20)

# min_samples must be an integer >= 1 for scikit-learn; 0 and 1 both make
# every point a core point, so clamping does not change the labels.
db = DBSCAN(eps=eps, min_samples=max(1, int(minimal_epc)), algorithm='auto')
# One fit_predict both fits the model and returns the labels.
y_pred = db.fit_predict(X)
clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
calosc = pd.concat([proba, clusters], axis=1)

if (db.labels_ == -1).any():
    # Noise readings per EPC.
    a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
    flagged = [
        {
            'MEASUREMENT': measurement,
            'EAN': proba[proba['EPC'] == b].EAN.iloc[0],
            'StyleColor': style_color,
        }
        for b in a.index
        # Flag an EPC when more than 60% of its readings are noise.
        if a[b] > 0.6 * (proba['EPC'] == b).sum()
    ]
    if flagged:
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        outliery = pd.concat([outliery, pd.DataFrame(flagged)], ignore_index=True)
" <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliery" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z1)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " 
}\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 277, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /5\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 278, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " 
<td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 278, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 280, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 280, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /6\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style 
scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 281, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 283, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 283, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# eps min 5000 / 5\n", + "# \n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 284, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 29 5902805820447 RH797-81X\n", + "3 32 5902805820447 RH797-81X\n", + "4 34 5902805820447 RH797-81X\n", + "5 38 5902851852614 SO133-09M\n", + "6 38 5902851852638 SO133-09M\n", + "7 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 284, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c=0.6\n", + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " 
<td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 317, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " 
def c2(x, *, min_neighbors=20, max_min_samples=70, knee_divisor=5,
       eps_floor=5000, noise_share=0.3, algorithm='auto'):
    """Report EPCs whose readings DBSCAN labels mostly as noise.

    Each (StyleColor, MEASUREMENT) group of ``x`` is clustered on the
    ``SUM_PROXIMITY_KMS`` / ``TIME_MS`` plane.  ``eps`` comes from the knee
    of the descending k-distance curve divided by ``knee_divisor`` and is
    never smaller than ``eps_floor``.  An EPC is reported when its number of
    noise readings exceeds ``noise_share`` times its reading count in that
    measurement.

    Args:
        x: DataFrame with columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
            ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.
        min_neighbors: Lower bound for k in the k-distance curve.
        max_min_samples: Cap for DBSCAN ``min_samples``; when the cap is hit
            k is set to the same value.
        knee_divisor: Divisor applied to the knee distance to obtain ``eps``.
        eps_floor: Smallest ``eps`` ever used; also used when no knee exists.
        noise_share: Fraction of an EPC's readings that must be noise.
        algorithm: Passed through to ``DBSCAN(algorithm=...)``.

    Returns:
        DataFrame with columns ``MEASUREMENT``, ``EAN``, ``StyleColor``; one
        row per reported EPC (empty when nothing is reported).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    flagged = []

    for style in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style]

        for measurement in style_rows.MEASUREMENT.unique():
            proba = style_rows[style_rows.MEASUREMENT == measurement]
            points = proba[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            # Half the reading count of the least-read EPC, as an int.
            min_samples = int(proba['EPC'].value_counts().min() // 2)
            neighbors = max(min_neighbors, min_samples)
            if min_samples > max_min_samples:
                min_samples = neighbors = max_min_samples
            # kneighbors() raises when k exceeds the number of samples.
            neighbors = min(neighbors, len(points))

            # Descending distance of every reading to its k-th neighbour.
            knn = NearestNeighbors(n_neighbors=neighbors).fit(points)
            distances, _ = knn.kneighbors(points)
            k_distances = np.sort(distances[:, neighbors - 1])[::-1]

            knee = None
            if len(k_distances) >= 3:  # too few points do not form a curve
                knee = KneeLocator(range(1, len(k_distances) + 1),
                                   k_distances,
                                   S=1.0,
                                   curve="convex",
                                   direction="decreasing").knee_y
            # knee_y is None when no knee is found; fall back to the floor
            # instead of failing on None / knee_divisor.
            eps = eps_floor if knee is None else max(eps_floor,
                                                     knee / knee_divisor)

            # fit_predict() fits and labels in one pass.  min_samples below 1
            # is rejected by current scikit-learn; 1 is equivalent because a
            # point counts itself as a neighbour.
            labels = DBSCAN(eps=eps,
                            min_samples=max(min_samples, 1),
                            algorithm=algorithm).fit_predict(points)

            noise_per_epc = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                readings = proba[proba['EPC'] == epc]
                # Denominator kept as in the original: non-null count of the
                # first column of this EPC's rows.
                if noise_count > noise_share * readings.iloc[:, 0].count():
                    flagged.append({'MEASUREMENT': measurement,
                                    'EAN': readings.EAN.iloc[0],
                                    'StyleColor': style})

    return pd.DataFrame(flagged, columns=columns)
+ " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 24 5902805820461 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 27 5902805820447 RH797-81X\n", + "7 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902805820447 RH797-81X" + ] + }, + "execution_count": 71, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + 
"execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " 
def c3(x, *, min_neighbors=2, max_min_samples=70, knee_divisor=4,
       eps_floor=5000, noise_share=0.5, algorithm='kd_tree'):
    """Report EPCs for which DBSCAN marks a large share of readings as noise.

    For every StyleColor and every MEASUREMENT inside it, the readings are
    clustered on (``SUM_PROXIMITY_KMS``, ``TIME_MS``).  The radius ``eps`` is
    the knee of the descending k-distance curve divided by ``knee_divisor``,
    bounded below by ``eps_floor``.  An EPC is reported when its noise
    readings outnumber ``noise_share`` times its reading count.

    Args:
        x: DataFrame with columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
            ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.
        min_neighbors: Lower bound for k in the k-distance curve.
        max_min_samples: Cap for DBSCAN ``min_samples``; when the cap is hit
            k is set to the same value.
        knee_divisor: Divisor applied to the knee distance to obtain ``eps``.
        eps_floor: Smallest ``eps`` ever used; also used when no knee exists.
        noise_share: Fraction of an EPC's readings that must be noise.
        algorithm: Passed through to ``DBSCAN(algorithm=...)``.

    Returns:
        DataFrame with columns ``MEASUREMENT``, ``EAN``, ``StyleColor``; one
        row per reported EPC (empty when nothing is reported).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Collected as plain dicts: DataFrame.append no longer exists in
    # pandas 2.0 and re-allocated the whole frame on every call.
    flagged = []

    for style in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style]

        for measurement in style_rows.MEASUREMENT.unique():
            proba = style_rows[style_rows.MEASUREMENT == measurement]
            points = proba[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            # Half the reading count of the least-read EPC, as an int.
            min_samples = int(proba['EPC'].value_counts().min() // 2)
            neighbors = max(min_neighbors, min_samples)
            if min_samples > max_min_samples:
                min_samples = neighbors = max_min_samples
            # A one-reading measurement cannot supply two neighbours;
            # kneighbors() raises when k exceeds the sample count.
            neighbors = min(neighbors, len(points))

            # Descending distance of every reading to its k-th neighbour.
            knn = NearestNeighbors(n_neighbors=neighbors).fit(points)
            distances, _ = knn.kneighbors(points)
            k_distances = np.sort(distances[:, neighbors - 1])[::-1]

            knee = None
            if len(k_distances) >= 3:  # too few points do not form a curve
                knee = KneeLocator(range(1, len(k_distances) + 1),
                                   k_distances,
                                   S=1.0,
                                   curve="convex",
                                   direction="decreasing").knee_y
            # knee_y is None when the curve has no knee; use the floor then.
            eps = eps_floor if knee is None else max(eps_floor,
                                                     knee / knee_divisor)

            # Single fit via fit_predict(); min_samples is passed as an int
            # of at least 1 (0 and 1 behave the same, but 0 is rejected by
            # current scikit-learn).
            labels = DBSCAN(eps=eps,
                            min_samples=max(min_samples, 1),
                            algorithm=algorithm).fit_predict(points)

            noise_per_epc = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                readings = proba[proba['EPC'] == epc]
                # Denominator kept as in the original: non-null count of the
                # first column of this EPC's rows.
                if noise_count > noise_share * readings.iloc[:, 0].count():
                    flagged.append({'MEASUREMENT': measurement,
                                    'EAN': readings.EAN.iloc[0],
                                    'StyleColor': style})

    return pd.DataFrame(flagged, columns=columns)
<td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c3(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " 
def c4(x):
    """Flag, per (StyleColor, MEASUREMENT) group, the tag that dominates DBSCAN noise.

    For every StyleColor and every MEASUREMENT within it, the readings are
    clustered with DBSCAN on the (SUM_PROXIMITY_KMS, TIME_MS) plane. ``eps`` is
    estimated from the knee of the sorted k-nearest-neighbour distance curve
    (half the knee distance, never below 5000). When the group has more noise
    points than the ``min_samples`` base value and a single EPC accounts for
    more than half of them, that EPC's EAN is reported.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``MEASUREMENT``, ``EAN``, ``StyleColor``; at most one row per
        (StyleColor, MEASUREMENT) group. Empty when nothing is flagged.
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0
    # and re-allocating the frame on every hit is quadratic anyway.
    found = []

    for j in x.StyleColor.unique():
        DF = x[x.StyleColor == j]

        for i in DF.MEASUREMENT.unique():
            proba = DF[DF.MEASUREMENT == i]
            points = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

            # Half the reading count of the rarest tag, capped at 70. Kept as a
            # plain int because scikit-learn validates min_samples as integral.
            minimal_epc = min(int(proba['EPC'].value_counts().min() // 2), 70)
            neighbors = max(2, minimal_epc)

            if len(proba) < neighbors:
                # Only reachable for a single-reading group: a k-distance curve
                # needs at least `neighbors` points, and one point cannot be
                # noise with min_samples <= 1, so nothing could be flagged.
                continue

            nbrs = NearestNeighbors(n_neighbors=neighbors).fit(points)
            distances, _ = nbrs.kneighbors(points)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,  # y values
                                  S=1.0,  # parameter suggested from paper
                                  curve="convex",  # parameter from figure
                                  direction="decreasing")  # parameter from figure

            # knee_y is None when no knee is detected; fall back to the floor.
            knee = kneedle.knee_y
            eps = 5000 if knee is None else max(5000, knee / 2)

            # min_samples of 0 and 1 behave the same (every point is a core
            # point), but only 1 passes scikit-learn's parameter validation.
            db = DBSCAN(eps=eps, min_samples=max(1, minimal_epc), algorithm='auto')
            labels = db.fit_predict(points)  # fit_predict already fits the model

            is_noise = labels == -1
            if int(is_noise.sum()) <= minimal_epc:
                # Covers "no noise at all" as well, since minimal_epc >= 0.
                continue

            # Noise readings per tag, most frequent first.
            noise_counts = proba['EPC'][is_noise].value_counts()
            if noise_counts.empty:
                continue

            # Report the group only when one tag owns more than half the noise.
            if 2 * noise_counts.iloc[0] > noise_counts.sum():
                top_epc = noise_counts.index[0]
                found.append({'MEASUREMENT': i,
                              'EAN': proba.loc[proba['EPC'] == top_epc, 'EAN'].iloc[0],
                              'StyleColor': j})

    return pd.DataFrame(found, columns=columns)
</tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>7</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>11</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>1</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>5</td>\n", + " <td>5902805431797</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>6</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>12</td>\n", + " <td>5902851535913</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>2</td>\n", + " <td>5902975217986</td>\n", + " <td>RV462-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>4</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>7</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>12</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>4</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>7</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>17</th>\n", + " <td>20</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>2</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>4</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>13</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>16</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>23</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>24</td>\n", + " <td>5902805219685</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>1</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>2</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>13</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>21</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>24</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>26</td>\n", + " <td>5902805444698</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>23</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>33</th>\n", + " <td>3</td>\n", + " <td>5902805385823</td>\n", + " <td>RJ371-53M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 6 5902805533040 RH267-85J\n", + "1 24 5902805533040 RH267-85J\n", + "2 4 5902851445731 RS483-99X\n", + "3 7 5902851445700 RS483-99X\n", + "4 11 5902851445731 RS483-99X\n", + "5 28 5902805303681 RJ369-87X\n", + "6 1 5902805431803 RM119-93X\n", + "7 5 5902805431797 RM119-93X\n", + "8 6 5902805431803 RM119-93X\n", + "9 12 5902851535913 RV167-MLC\n", + "10 2 5902975217986 RV462-87X\n", + "11 4 5902851414508 SL171-99X\n", + "12 7 5902851414508 SL171-99X\n", + "13 12 5902851414508 SL171-99X\n", + "14 4 5902851852638 SO133-09M\n", + "15 7 5902851852638 SO133-09M\n", + "16 14 5902851852638 SO133-09M\n", + "17 20 5902851852638 SO133-09M\n", + "18 2 5902690542769 QY337-00X\n", + "19 4 5902690542745 QY337-00X\n", + "20 13 5902690542769 QY337-00X\n", + "21 16 5902690542745 QY337-00X\n", + "22 23 5902690542745 QY337-00X\n", + "23 24 5902805219685 RN633-00X\n", + "24 1 5902805533255 RH267-59J\n", + "25 2 5902805533255 RH267-59J\n", + "26 13 5902805533255 RH267-59J\n", + "27 21 5902805533255 RH267-59J\n", + "28 24 5902805533255 RH267-59J\n", + "29 26 5902805444698 RJ365-09M\n", + "30 21 5902805820447 RH797-81X\n", + "31 23 5902805820447 RH797-81X\n", + "32 28 5902805820447 RH797-81X\n", + "33 3 5902805385823 RJ371-53M" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c4(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + 
"<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>41</td>\n", + " <td>5902805532999</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>40</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>32</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>37</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>40</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>32</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>36</td>\n", + " <td>5902805820423</td>\n", + " <td>RH797-59X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>37</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", 
+ " <th>14</th>\n", + " <td>32</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>40</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>39</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>32</td>\n", + " <td>5902805820546</td>\n", + " <td>RH797-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 41 5902805532999 RH267-55J\n", + "1 32 5902851414515 SL171-99X\n", + "2 34 5902851414515 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 32 5902805431803 RM119-93X\n", + "5 40 5902805431803 RM119-93X\n", + "6 32 5902851445700 RS483-99X\n", + "7 37 5902851445700 RS483-99X\n", + "8 40 5902851445731 RS483-99X\n", + "9 32 5902690542745 QY337-00X\n", + "10 36 5902805820423 RH797-59X\n", + "11 37 5902805303681 RJ369-87X\n", + "12 29 5902805820447 RH797-81X\n", + "13 32 5902805820447 RH797-81X\n", + "14 32 5902805533255 RH267-59J\n", + "15 38 5902805533255 RH267-59J\n", + "16 40 5902805533255 RH267-59J\n", + "17 29 5902975236994 SF078-MLC\n", + "18 36 5902975236994 SF078-MLC\n", + "19 41 5902975236994 SF078-MLC\n", + "20 39 5902851852638 SO133-09M\n", + "21 32 5902805820546 RH797-00X" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ 
def c5(x):
    """Flag tags whose readings are mostly DBSCAN noise, per (StyleColor, MEASUREMENT).

    Each group is clustered with DBSCAN on the (SUM_PROXIMITY_KMS, TIME_MS)
    plane using ``eps = 30 * (readings of the most-read EPC)`` and
    ``min_samples = half the readings of the least-read EPC``. Every EPC with
    more than half of its readings labelled as noise contributes one row.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``MEASUREMENT``, ``EAN``, ``StyleColor``; one row per flagged
        EPC (several EPCs sharing an EAN yield repeated rows). Empty when
        nothing is flagged.
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0.
    found = []

    for j in x.StyleColor.unique():
        DF = x[x.StyleColor == j]

        for i in DF.MEASUREMENT.unique():
            proba = DF[DF.MEASUREMENT == i]
            points = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

            # Readings per tag inside this group.
            readings = proba['EPC'].value_counts()

            # scikit-learn validates min_samples as an integer >= 1; 0 and 1
            # are equivalent for DBSCAN (every point is a core point).
            min_samples = max(1, int(readings.min() // 2))
            eps = float(readings.max() * 30)

            db = DBSCAN(eps=eps, min_samples=min_samples, algorithm='kd_tree')
            labels = db.fit_predict(points)  # fit_predict already fits the model

            # Noise readings per tag, most frequent first.
            noise_counts = proba['EPC'][labels == -1].value_counts()
            for epc, noisy in noise_counts.items():
                # Flag the tag when noise is more than half of its readings.
                if noisy > 0.5 * readings[epc]:
                    found.append({'MEASUREMENT': i,
                                  'EAN': proba.loc[proba['EPC'] == epc, 'EAN'].iloc[0],
                                  'StyleColor': j})

    return pd.DataFrame(found, columns=columns)
" <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 27 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(u)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " 
<td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 35 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 29 5902805820447 RH797-81X\n", + "5 32 5902805820447 RH797-81X\n", + "6 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>13</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>13</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " 
<td>26</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>16</td>\n", + " <td>5902851547602</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>6</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>6</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>7</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>11</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>11</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>11</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>12</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>13</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>13</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>17</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>24</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " 
<td>12</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>28</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>19</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>19</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>5</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 13 5902805533040 RH267-85J\n", + "1 13 5902805533040 RH267-85J\n", + "2 26 5902851445700 RS483-99X\n", + "3 16 5902851547602 SB281-90M\n", + "4 6 5902851852638 SO133-09M\n", + "5 6 
5902851852614 SO133-09M\n", + "6 7 5902851852638 SO133-09M\n", + "7 11 5902851852614 SO133-09M\n", + "8 11 5902851852638 SO133-09M\n", + "9 11 5902851852638 SO133-09M\n", + "10 12 5902851852638 SO133-09M\n", + "11 13 5902851852638 SO133-09M\n", + "12 13 5902851852614 SO133-09M\n", + "13 18 5902851852638 SO133-09M\n", + "14 17 5902851852638 SO133-09M\n", + "15 24 5902851852638 SO133-09M\n", + "16 4 5902975236994 SF078-MLC\n", + "17 4 5902975236994 SF078-MLC\n", + "18 12 5902975236956 SF078-MLC\n", + "19 23 5902975236994 SF078-MLC\n", + "20 28 5902975236994 SF078-MLC\n", + "21 11 5902690542745 QY337-00X\n", + "22 11 5902690542769 QY337-00X\n", + "23 19 5902690542745 QY337-00X\n", + "24 19 5902690542769 QY337-00X\n", + "25 1 5902805533255 RH267-59J\n", + "26 5 5902805533255 RH267-59J\n", + "27 21 5902805820447 RH797-81X\n", + "28 24 5902805820447 RH797-81X\n", + "29 26 5902805820447 RH797-81X\n", + "30 27 5902805820447 RH797-81X\n", + "31 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bez if i 0,3\n", + "c5(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", 
+ " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>29</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>35</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>32</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " 
# --- Experiment 1: a single StyleColor inside a single MEASUREMENT -----------
outliery1 = pd.DataFrame(columns=['MEASUREMENT', 'EAN', 'StyleColor'])
j = 'RH797-81X'  # another style that was inspected here: 'RH267-55J'
i = 24

DF = train[(train.StyleColor == j)]
proba = DF[DF.MEASUREMENT == i]
X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)
# Half the reading count of the least-read tag in the group.
minimal_epc = np.floor(proba['EPC'].value_counts().min() / 2)

eps = 6000

# scikit-learn validates min_samples as an integer >= 1; 0 and 1 are
# equivalent for DBSCAN (every point is a core point).
db = DBSCAN(eps=eps, min_samples=max(1, int(minimal_epc)), algorithm='auto')
y_pred = db.fit_predict(X)  # fit_predict already fits; no separate fit() needed
clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)

if (db.labels_ == -1).any():
    # Noise readings per tag, most frequent first.
    a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
    for b in a.index:
        # Flag the tag when more than half of its readings are noise.
        if a[b] > 0.5 * (proba['EPC'] == b).sum():
            # pd.concat replaces DataFrame.append, removed in pandas 2.0.
            outliery1 = pd.concat(
                [outliery1,
                 pd.DataFrame([{'MEASUREMENT': i,
                                'EAN': proba[proba['EPC'] == b].EAN.iloc[0],
                                'StyleColor': j}])],
                ignore_index=True)

outliery1

# --- Experiment 2: sweep every (MEASUREMENT, StyleColor) group ---------------
# NOTE(review): this reuses the `db` estimator configured above (eps=6000 and
# the min_samples derived from the single group of experiment 1) for every
# group — confirm that is intended rather than per-group parameters.
outliery2 = pd.DataFrame(columns=['MEASUREMENT', 'EAN', 'StyleColor'])
for i in train.MEASUREMENT.unique():
    DF = train[(train.MEASUREMENT == i)]

    for k in DF.StyleColor.unique():
        proba = DF[DF.StyleColor == k]
        X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

        y_pred = db.fit_predict(X)
        clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
        calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)

        if (db.labels_ == -1).any():
            a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
            for b in a.index:
                # Stricter threshold than experiment 1: over 90% of the tag's
                # readings must be noise.
                if a[b] > 0.9 * (proba['EPC'] == b).sum():
                    outliery2 = pd.concat(
                        [outliery2,
                         pd.DataFrame([{'MEASUREMENT': i,
                                        'EAN': proba[proba['EPC'] == b].EAN.iloc[0],
                                        'StyleColor': k}])],
                        ignore_index=True)
<tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>173</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>174</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>175</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>176 rows Ă 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 3 5902690542769 QY337-00X\n", + "1 3 5902690542745 QY337-00X\n", + "2 3 5902690542769 QY337-00X\n", + "3 4 5902805533040 RH267-85J\n", + "4 4 5902805533040 RH267-85J\n", + ".. ... ... 
# --- Scratch check of a single StyleColor / MEASUREMENT pair -----------------
# Uses the `db` estimator, `train` frame and `outliery1` frame from earlier
# cells. The inspected pair is named explicitly so that the recorded row
# describes the data that was actually clustered (the previous version wrote
# whatever values the loop variables `i` / `j` happened to hold).
scratch_style = 'RH797-81X'
scratch_measurement = 24

DF = train[train.StyleColor == scratch_style]
proba = DF[DF.MEASUREMENT == scratch_measurement]
X = proba[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()
y_pred = db.fit_predict(X)  # a single fit; labels are aligned with proba's rows
calosc = proba.assign(CLUSTER=y_pred)

# Noise (-1) readings per tag; empty when DBSCAN found no noise.
a = calosc.loc[calosc.CLUSTER == -1, 'EPC'].value_counts()
scratch_totals = proba['EPC'].value_counts()
scratch_rows = [
    {
        'MEASUREMENT': scratch_measurement,
        'EAN': proba.loc[proba['EPC'] == b, 'EAN'].iloc[0],
        'StyleColor': scratch_style,
    }
    for b, noise_count in a.items()
    if noise_count > 0.5 * scratch_totals[b]
]
if scratch_rows:
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    outliery1 = pd.concat([outliery1, pd.DataFrame(scratch_rows)], ignore_index=True)

a  # notebook cell: display the noise counts per EPC


def c6(x, min_eps=2000, knee_divisor=3, outlier_share=0.4):
    """Return the tags (EPC) whose readings DBSCAN mostly labels as noise.

    For every StyleColor / MEASUREMENT pair in ``x`` the readings are
    clustered on (SUM_PROXIMITY_KMS, TIME_MS). ``eps`` is derived from the
    knee of the sorted k-nearest-neighbour distance curve and ``min_samples``
    from the number of readings of the least-read tag.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns StyleColor, MEASUREMENT, EPC,
        SUM_PROXIMITY_KMS and TIME_MS.
    min_eps : float, default 2000
        Lower bound for the DBSCAN ``eps``; also used when no knee is found.
    knee_divisor : float, default 3
        The knee distance is divided by this value to obtain ``eps``.
    outlier_share : float, default 0.4
        A tag is reported when more than this share of its readings in the
        pair is labelled as noise (-1).

    Returns
    -------
    pandas.DataFrame
        Columns MEASUREMENT, EPC, StyleColor; one row per reported tag and
        measurement. Empty (with those columns) when nothing is reported.
    """
    rows = []
    for j in x.StyleColor.unique():
        style_df = x[x.StyleColor == j]

        for i in style_df.MEASUREMENT.unique():
            proba = style_df[style_df.MEASUREMENT == i]
            readings_per_epc = proba['EPC'].value_counts()
            if readings_per_epc.empty:
                # Nothing to cluster (e.g. a NaN key selected no rows).
                continue

            features = proba[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            # Half the readings of the least-read tag, kept within [1, 70].
            # DBSCAN requires an integer >= 1 here.
            min_samples = min(70, max(1, int(readings_per_epc.min() // 2)))
            # kneighbors() raises when asked for more neighbours than rows.
            neighbors = min(max(10, min_samples), len(proba))

            nbrs = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = nbrs.kneighbors(features)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            knee = None
            if len(distance_desc) >= 3:
                kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                      distance_desc,                      # y values
                                      S=1.0,
                                      curve="convex",
                                      direction="decreasing")
                knee = kneedle.knee_y
            # knee_y is None when no knee was detected: fall back to min_eps.
            eps = min_eps if knee is None else max(min_eps, knee / knee_divisor)

            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='auto').fit_predict(features)

            noise_per_epc = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                if noise_count > outlier_share * readings_per_epc[epc]:
                    rows.append({'MEASUREMENT': i, 'EPC': epc, 'StyleColor': j})

    return pd.DataFrame(rows, columns=['MEASUREMENT', 'EPC', 'StyleColor'])


# Subsets restricted to the three styles inspected in the following cells.
styles7 = ['SF078-MLC', 'RH797-81X', 'SL171-99X']
train7 = train[train.StyleColor.isin(styles7)]
test7 = test[test.StyleColor.isin(styles7)]
}, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 27 303568480C287AC000B5BAD5 SL171-99X\n", + "1 28 30356849FC1724C000B59A42 SF078-MLC\n", + "2 21 3035684754501F0000B5B614 RH797-81X\n", + "3 24 3035684754501F0000B5B614 RH797-81X\n", + "4 27 3035684754501F0000B5B614 RH797-81X\n", + "5 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train7)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 32 303568480C287AC000B5BAD5 SL171-99X\n", + "1 35 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 41 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287A8000B5BABD SL171-99X\n", + "9 29 3035684754501F0000B5B614 RH797-81X\n", + "10 32 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " 
<td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 23 5902975236994 SF078-MLC\n", + "1 24 5902975236994 SF078-MLC\n", + "2 21 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 24 5902805820447 RH797-81X\n", + "6 24 5902805820461 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>40</td>\n", + " 
<td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 32 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 38 5902851414508 SL171-99X\n", + "4 38 5902851414508 SL171-99X\n", + "5 42 5902851414515 SL171-99X\n", + "6 42 5902851414508 SL171-99X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 29 5902975236994 SF078-MLC\n", + "12 36 5902975236994 SF078-MLC\n", + "13 38 5902975236994 SF078-MLC\n", + "14 38 5902975236956 SF078-MLC\n", + "15 40 5902975236994 SF078-MLC\n", + "16 41 5902975236994 SF078-MLC\n", + "17 42 5902975236956 SF078-MLC" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>12</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 4 5902975236994 SF078-MLC\n", + "1 4 5902975236994 SF078-MLC\n", + "2 12 5902975236956 SF078-MLC\n", + "3 23 5902975236994 SF078-MLC\n", + "4 28 5902975236994 SF078-MLC\n", + "5 21 5902805820447 RH797-81X\n", + "6 24 5902805820447 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 35 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 29 5902805820447 RH797-81X\n", + "5 32 5902805820447 RH797-81X\n", + "6 34 5902805820447 RH797-81X\n", + "7 38 5902975236994 SF078-MLC" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>12</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>23</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>28</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>7</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>17</td>\n", + " <td>303568480C3455C000B5B30A</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>17</td>\n", + " <td>30356847542CCD0000B59A80</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD8000B599FA</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD0000B59A26</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>4</td>\n", + " <td>3035684754340CC000B594C3</td>\n", 
+ " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>18</td>\n", + " <td>3035684754340D0000B594EB</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>18</td>\n", + " <td>3035684754340CC000B594C6</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>7</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>17</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>19</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>19</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>7</td>\n", + " <td>303568475415740000B5A5CD</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " 
<td>303568475434134000B5B6DF</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D4000B5B656</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D0000B5B65A</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>12</td>\n", + " <td>30356849FC1E348000B5B2D4</td>\n", + " <td>SP090-90X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 12 303568480C2B874000B59A39 RS483-99X\n", + "1 23 303568480C2B874000B59A39 RS483-99X\n", + "2 28 303568480C2B868000B599B2 RS483-99X\n", + "3 1 303568480C357A0000B59999 SB281-90M\n", + "4 16 303568480C357A0000B59999 SB281-90M\n", + "5 7 30356847541DA80000B5BA54 RJ369-87X\n", + "6 17 303568480C3455C000B5B30A RV167-MLC\n", + "7 17 30356847542CCD0000B59A80 QJ677-33X\n", + "8 26 30356847542CCD8000B599FA QJ677-33X\n", + "9 26 30356847542CCD0000B59A26 QJ677-33X\n", + "10 4 3035684754340CC000B594C3 RH267-55J\n", + "11 18 3035684754340D0000B594EB RH267-55J\n", + "12 18 3035684754340CC000B594C6 RH267-55J\n", + "13 27 303568480C287AC000B5BAD5 SL171-99X\n", + "14 7 303568480C5343C000B599F6 SO133-09M\n", + "15 17 303568480C5343C000B599C8 SO133-09M\n", + "16 17 303568480C53434000B599E1 SO133-09M\n", + "17 24 303568480C53434000B599E1 SO133-09M\n", + "18 24 
30356847540FE2C000B59A68 RB254-00X\n", + "19 28 30356849FC1724C000B59A42 SF078-MLC\n", + "20 11 303568458835008000B5BAD1 QY337-00X\n", + "21 11 303568458835010000B5BA58 QY337-00X\n", + "22 19 303568458835008000B5BAD1 QY337-00X\n", + "23 19 303568458835010000B5BA58 QY337-00X\n", + "24 7 303568475415740000B5A5CD RN633-00X\n", + "25 1 303568475434134000B5B6DF RH267-59J\n", + "26 2 30356847542B6D4000B5B656 RJ365-09M\n", + "27 2 30356847542B6D0000B5B65A RJ365-09M\n", + "28 12 30356849FC1E348000B5B2D4 SP090-90X\n", + "29 21 3035684754501F0000B5B614 RH797-81X\n", + "30 24 3035684754501F0000B5B614 RH797-81X\n", + "31 27 3035684754501F0000B5B614 RH797-81X\n", + "32 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " 
<td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>303568480C34548000B5B2B5</td>\n", + " <td>RV167-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>32</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>303568475415744000B599FE</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>39</td>\n", + " 
<td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>35</td>\n", + " <td>303568475450218000B59781</td>\n", + " <td>RH797-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 35 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287AC000B5BAD5 SL171-99X\n", + "4 38 303568480C287A8000B5BA63 SL171-99X\n", + "5 38 303568480C287A8000B5BADA SL171-99X\n", + "6 38 303568480C287A8000B5BABD SL171-99X\n", + "7 41 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287AC000B5BAD5 SL171-99X\n", + "9 42 303568480C287A8000B5BABD SL171-99X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + "11 38 303568480C34548000B5B2B5 RV167-87X\n", + "12 42 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835008000B5BAD1 QY337-00X\n", + "14 32 30356847541DA80000B5BA54 RJ369-87X\n", + "15 29 3035684754501F0000B5B614 RH797-81X\n", + "16 32 3035684754501F0000B5B614 RH797-81X\n", + "17 29 303568475415744000B599FE RN633-00X\n", + "18 39 303568480C5343C000B599F6 SO133-09M\n", + "19 42 303568480C5343C000B599F6 SO133-09M\n", + "20 42 303568480C53434000B599E1 SO133-09M\n", + "21 42 303568480C5343C000B599C8 SO133-09M\n", + "22 35 303568475450218000B59781 RH797-00X" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, 
+ "metadata": {}, + "outputs": [], + "source": [ + "def c7(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EPC', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(10,minimal_epc.astype(int))\n", + " \n", + " if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(4000,kneedle.knee_y/5)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.4*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EPC':proba[proba['EPC'] == b].EPC.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ 
+ { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>26</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4</td>\n", + " <td>30356847541DA7C000B5BADD</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>11</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>11</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>11</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>18</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>18</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>18</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>17</td>\n", + " 
<td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>24</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>24</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>23</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>1</td>\n", + " <td>303568475434134000B5B6DF</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>26</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", 
+ "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 26 303568480C2B868000B599B2 RS483-99X\n", + "1 16 303568480C357A0000B59999 SB281-90M\n", + "2 4 30356847541DA7C000B5BADD RJ369-87X\n", + "3 26 30356847541DA80000B5BA54 RJ369-87X\n", + "4 11 303568480C53434000B599E1 SO133-09M\n", + "5 11 303568480C5343C000B599F6 SO133-09M\n", + "6 11 303568480C5343C000B599C8 SO133-09M\n", + "7 18 303568480C5343C000B599C8 SO133-09M\n", + "8 18 303568480C5343C000B599F6 SO133-09M\n", + "9 18 303568480C53434000B599E1 SO133-09M\n", + "10 17 303568480C5343C000B599C8 SO133-09M\n", + "11 17 303568480C53434000B599E1 SO133-09M\n", + "12 24 303568480C5343C000B599F6 SO133-09M\n", + "13 24 303568480C53434000B599E1 SO133-09M\n", + "14 24 303568480C5343C000B599C8 SO133-09M\n", + "15 24 30356847540FE2C000B59A68 RB254-00X\n", + "16 23 30356849FC1724C000B59A42 SF078-MLC\n", + "17 11 303568458835008000B5BAD1 QY337-00X\n", + "18 11 303568458835010000B5BA58 QY337-00X\n", + "19 1 303568475434134000B5B6DF RH267-59J\n", + "20 21 3035684754501F0000B5B614 RH797-81X\n", + "21 24 3035684754501F0000B5B614 RH797-81X\n", + "22 26 3035684754501F0000B5B614 RH797-81X\n", + "23 27 3035684754501F0000B5B614 RH797-81X\n", + "24 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c7(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>51356847542A2B0000B5B280</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>30356847542A2B0000B5B215</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>34</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>35</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + 
" <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA61</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>42</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>34</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>42</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>38</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>39</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>42</td>\n", + " <td>30356849FC1723C000B5B1A3</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>38</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>38</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>38</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>39</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>38</td>\n", + " <td>303568480C357A0000B59A61</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 42 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287A8000B5BABD SL171-99X\n", + "8 42 51356847542A2B0000B5B280 RM119-93X\n", + "9 42 30356847542A2B0000B5B215 RM119-93X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + "11 34 303568458835008000B5BAD1 QY337-00X\n", + "12 35 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835010000B5BA58 QY337-00X\n", + "14 42 303568458835008000B5BAD1 QY337-00X\n", + "15 42 303568458835010000B5BA61 QY337-00X\n", + "16 42 30356847541DA80000B5BA54 RJ369-87X\n", + "17 29 3035684754501F0000B5B614 RH797-81X\n", + "18 32 3035684754501F0000B5B614 RH797-81X\n", + "19 34 3035684754501F0000B5B614 RH797-81X\n", + "20 42 3035684754501F0000B5B614 RH797-81X\n", + "21 38 30356849FC1724C000B59A42 SF078-MLC\n", + "22 39 30356849FC1724C000B59A42 SF078-MLC\n", + "23 42 30356849FC1723C000B5B1A3 SF078-MLC\n", + "24 38 303568480C53434000B599E1 SO133-09M\n", + "25 38 303568480C5343C000B599F6 SO133-09M\n", + "26 38 303568480C5343C000B599C8 SO133-09M\n", + "27 39 303568480C5343C000B599F6 SO133-09M\n", + "28 42 303568480C5343C000B599F6 SO133-09M\n", + "29 42 303568480C53434000B599E1 SO133-09M\n", + "30 42 303568480C5343C000B599C8 SO133-09M\n", + 
"31 38 303568480C357A0000B59A61 SB281-90M" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c7(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.ipynb_checkpoints/DBSCAN_1-zmiany_df-checkpoint.ipynb b/.ipynb_checkpoints/DBSCAN_1-zmiany_df-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..694c4ac49efedcbc045715864b0e6e838c605e6d --- /dev/null +++ b/.ipynb_checkpoints/DBSCAN_1-zmiany_df-checkpoint.ipynb @@ -0,0 +1,4533 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pyodbc\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import urllib\n", + "import seaborn as sns\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "params = urllib.parse.quote_plus(\"DRIVER={ODBC Driver 17 for SQL Server};\"\n", + " #\"SERVER=dbserver.mif.pg.gda.pl,1433;\"\n", + " \"SERVER=127.0.0.1,1433;\"\n", + " \"DATABASE=silkycoders;\"\n", + " \"UID=;\"\n", + " \"PWD=\")\n", + "\n", + "engine = create_engine(\"mssql+pyodbc:///?odbc_connect={}\".format(params))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"SELECT rfid.*, it.*, sub.*, cl.*, dep.*, br.*\n", + " FROM 
rfid.Logs rfid \n", + " JOIN rfid.EanEpc ean \n", + " ON rfid.EPC = ean.EPC \n", + " JOIN dw.Item it \n", + " ON ean.EAN = it.EAN \n", + " JOIN dw.Subclass sub \n", + " ON sub.SubclassID = it.SubclassID\n", + " JOIN dw.Class cl\n", + " ON sub.ClassID = cl.ClassID\n", + " JOIN dw.Department dep\n", + " ON dep.DepartmentID = cl.DepartmentID\n", + " JOIN dw.Brand br\n", + " ON dep.BrandID = br.BrandID\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql_query(query, engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_timestamp(df : pd.DataFrame):\n", + " \n", + " dt = df.sort_values(by=\"TIMESTAMP\").reset_index(drop=True)\n", + " dt[\"HOUR\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.hour.astype(int)\n", + " dt[\"MIN\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.minute.astype(int)\n", + " dt[\"SEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.second.astype(int)\n", + " dt[\"MICROSEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.microsecond.astype(int)\n", + "\n", + " dt[\"MILISEC\"] = dt.MICROSEC/1000 + dt.SEC*1000 + dt.MIN*60000 + dt.HOUR*3600000\n", + " dt[\"TIME_MS\"] = dt.MILISEC - dt.MILISEC[0]\n", + " \n", + " dt['TIME_PER_MEASUREMENT_MS'] = 0\n", + " dt['NUMBER_OF_SIGNALS'] = 0\n", + " dt['LENGTH_OF_MEASUREMENT'] = 0\n", + " \n", + " for m in dt.MEASUREMENT.unique():\n", + " filtr = (dt.MEASUREMENT == m)\n", + " dt.loc[filtr,'TIME_PER_MEASUREMENT_MS'] = dt[filtr].MILISEC - dt[filtr].MILISEC.iloc[0]\n", + " dt.loc[filtr, \"NUMBER_OF_SIGNALS\"] = len(dt[filtr])\n", + " dt.loc[filtr, 'LENGTH_OF_MEASUREMENT'] = dt[filtr].TIME_PER_MEASUREMENT_MS.max()\n", + " \n", + " dt[\"TIME_KMS\"] = np.floor(dt.TIME_MS/1000) \n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].max().reset_index(name=\"MAX_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " dt = 
dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].sum().reset_index(name=\"SUM_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " return dt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = prepare_timestamp(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('df.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 502689 entries, 0 to 502688\n", + "Data columns (total 36 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 EPC 502689 non-null object \n", + " 1 PROXIMITY 502689 non-null float64\n", + " 2 TIMESTAMP 502689 non-null object \n", + " 3 MEASUREMENT 502689 non-null int64 \n", + " 4 ITEMID 502689 non-null int64 \n", + " 5 EAN 502689 non-null int64 \n", + " 6 StyleColorSize 502689 non-null object \n", + " 7 StyleColor 502689 non-null object \n", + " 8 Size 502689 non-null object \n", + " 9 SubclassID 502689 non-null int64 \n", + " 10 ItemSeason 502689 non-null object \n", + " 11 FashionLevel 369997 non-null object \n", + " 12 SubclassID.1 502689 non-null int64 \n", + " 13 SubclassName 502689 non-null object \n", + " 14 ClassID 502689 non-null int64 \n", + " 15 ClassID.1 502689 non-null int64 \n", + " 16 ClassName 502689 non-null object \n", + " 17 DepartmentID 502689 non-null int64 \n", + " 18 DepartmentID.1 502689 non-null int64 \n", + " 19 DepartmentName 502689 non-null object \n", + " 20 BrandID 502689 non-null int64 \n", + " 21 BrandID.1 502689 non-null int64 \n", + " 22 BrandName 502689 non-null object \n", + " 23 Active 502689 non-null bool \n", + " 24 HOUR 502689 non-null int64 \n", + " 25 MIN 502689 non-null int64 \n", + " 26 SEC 502689 non-null int64 \n", + " 
27 MICROSEC 502689 non-null int64 \n", + " 28 MILISEC 502689 non-null float64\n", + " 29 TIME_MS 502689 non-null float64\n", + " 30 TIME_PER_MEASUREMENT_MS 502689 non-null float64\n", + " 31 NUMBER_OF_SIGNALS 502689 non-null int64 \n", + " 32 LENGTH_OF_MEASUREMENT 502689 non-null int64 \n", + " 33 TIME_KMS 502689 non-null float64\n", + " 34 MAX_PROXIMITY_KMS 502689 non-null float64\n", + " 35 SUM_PROXIMITY_KMS 502689 non-null float64\n", + "dtypes: bool(1), float64(7), int64(17), object(11)\n", + "memory usage: 134.7+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " 
<td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>...</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.00000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.406017</td>\n", + " <td>23.412382</td>\n", + " <td>2.169862e+06</td>\n", + " <td>5.902835e+12</td>\n", + " <td>83.920704</td>\n", + " <td>83.920704</td>\n", + " <td>18.231477</td>\n", + " <td>18.231477</td>\n", + " <td>2.609574</td>\n", + " <td>2.609574</td>\n", + " <td>...</td>\n", + " <td>29.193547</td>\n", + " <td>499773.110213</td>\n", + " <td>3.721192e+07</td>\n", + " <td>2.018186e+06</td>\n", + " <td>100132.210719</td>\n", + " <td>15383.906986</td>\n", + " <td>199835.398777</td>\n", + " <td>2017.68607</td>\n", + " <td>-72.497318</td>\n", + " <td>-398.108291</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.698062</td>\n", + " <td>12.175284</td>\n", + " <td>7.798483e+04</td>\n", + " <td>7.380986e+07</td>\n", + " <td>142.489244</td>\n", + " <td>142.489244</td>\n", + " <td>8.844056</td>\n", + " <td>8.844056</td>\n", + " <td>0.937828</td>\n", + " <td>0.937828</td>\n", + " <td>...</td>\n", + " <td>17.223297</td>\n", + " <td>288469.414710</td>\n", + " <td>1.121487e+06</td>\n", + " <td>1.121487e+06</td>\n", + " <td>81859.831696</td>\n", + " <td>8217.121271</td>\n", + " <td>101049.072703</td>\n", + " <td>1121.48684</td>\n", + " <td>5.893956</td>\n", + " <td>262.167663</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " 
<td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.00000</td>\n", + " <td>-100.500000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>13.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>250000.000000</td>\n", + " <td>3.624898e+07</td>\n", + " <td>1.055248e+06</td>\n", + " <td>38108.000000</td>\n", + " <td>8533.000000</td>\n", + " <td>127122.000000</td>\n", + " <td>1055.00000</td>\n", + " <td>-76.400000</td>\n", + " <td>-515.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155604e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.719645e+07</td>\n", + " <td>2.002711e+06</td>\n", + " <td>78477.000000</td>\n", + " <td>13321.000000</td>\n", + " <td>176026.000000</td>\n", + " <td>2002.00000</td>\n", + " <td>-72.900000</td>\n", + " <td>-342.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>33.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " 
<td>749000.000000</td>\n", + " <td>3.815973e+07</td>\n", + " <td>2.965991e+06</td>\n", + " <td>139431.000000</td>\n", + " <td>22217.000000</td>\n", + " <td>265127.000000</td>\n", + " <td>2965.00000</td>\n", + " <td>-68.400000</td>\n", + " <td>-225.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-38.900000</td>\n", + " <td>43.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.912875e+07</td>\n", + " <td>3.935013e+06</td>\n", + " <td>435771.000000</td>\n", + " <td>35350.000000</td>\n", + " <td>435771.000000</td>\n", + " <td>3935.00000</td>\n", + " <td>-38.900000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 502689.000000 502689.000000 5.026890e+05 5.026890e+05 \n", + "mean -75.406017 23.412382 2.169862e+06 5.902835e+12 \n", + "std 5.698062 12.175284 7.798483e+04 7.380986e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 13.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155604e+06 5.902806e+12 \n", + "75% -71.900000 33.000000 2.226340e+06 5.902852e+12 \n", + "max -38.900000 43.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 502689.000000 502689.000000 502689.000000 502689.000000 \n", + "mean 83.920704 83.920704 18.231477 18.231477 \n", + "std 142.489244 142.489244 8.844056 8.844056 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 
630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 502689.000000 502689.000000 ... 502689.000000 502689.000000 \n", + "mean 2.609574 2.609574 ... 29.193547 499773.110213 \n", + "std 0.937828 0.937828 ... 17.223297 288469.414710 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 250000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 749000.000000 \n", + "max 6.000000 6.000000 ... 59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 5.026890e+05 5.026890e+05 502689.000000 502689.000000 \n", + "mean 3.721192e+07 2.018186e+06 100132.210719 15383.906986 \n", + "std 1.121487e+06 1.121487e+06 81859.831696 8217.121271 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.624898e+07 1.055248e+06 38108.000000 8533.000000 \n", + "50% 3.719645e+07 2.002711e+06 78477.000000 13321.000000 \n", + "75% 3.815973e+07 2.965991e+06 139431.000000 22217.000000 \n", + "max 3.912875e+07 3.935013e+06 435771.000000 35350.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 502689.000000 502689.00000 502689.000000 \n", + "mean 199835.398777 2017.68607 -72.497318 \n", + "std 101049.072703 1121.48684 5.893956 \n", + "min 53538.000000 0.00000 -100.500000 \n", + "25% 127122.000000 1055.00000 -76.400000 \n", + "50% 176026.000000 2002.00000 -72.900000 \n", + "75% 265127.000000 2965.00000 -68.400000 \n", + "max 435771.000000 3935.00000 -38.900000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 502689.000000 \n", + "mean -398.108291 \n", + "std 262.167663 \n", + "min -2629.400000 \n", + "25% -515.900000 \n", + "50% -342.900000 \n", + "75% -225.700000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.boxplot(df['LENGTH_OF_MEASUREMENT']);" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Usuwamy najdĹuĹźsze pomiary\n", + "df.drop(df.loc[df['LENGTH_OF_MEASUREMENT'] > 200000].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " 
<td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>...</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>3.177620e+05</td>\n", + " <td>3.177620e+05</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " <td>317762.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.526452</td>\n", + " <td>22.776858</td>\n", + " <td>2.170711e+06</td>\n", + " <td>5.902836e+12</td>\n", + " <td>82.639686</td>\n", + " <td>82.639686</td>\n", + " <td>18.142610</td>\n", + " <td>18.142610</td>\n", + " <td>2.603486</td>\n", + " <td>2.603486</td>\n", + " <td>...</td>\n", + " <td>29.378906</td>\n", + " <td>499539.922961</td>\n", + " <td>3.717748e+07</td>\n", + " <td>1.983745e+06</td>\n", + " <td>66756.283605</td>\n", + " <td>10103.277673</td>\n", + " <td>133222.336198</td>\n", + " <td>1983.244840</td>\n", + " <td>-72.536124</td>\n", + " <td>-404.190290</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.583369</td>\n", + " <td>12.801744</td>\n", + " <td>7.820521e+04</td>\n", + " <td>7.406653e+07</td>\n", + " <td>141.056329</td>\n", + " <td>141.056329</td>\n", + " <td>8.830015</td>\n", + " <td>8.830015</td>\n", + " <td>0.932835</td>\n", + " <td>0.932835</td>\n", + " <td>...</td>\n", + " <td>17.290876</td>\n", + " <td>288325.166786</td>\n", + " <td>1.183041e+06</td>\n", + " <td>1.183041e+06</td>\n", + " <td>44987.211264</td>\n", + " <td>3113.779828</td>\n", + " <td>39296.957914</td>\n", + " <td>1183.041729</td>\n", + " <td>5.831107</td>\n", + " <td>267.914649</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " 
<td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-98.000000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>12.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>249000.000000</td>\n", + " <td>3.617897e+07</td>\n", + " <td>9.852318e+05</td>\n", + " <td>29587.000000</td>\n", + " <td>8027.000000</td>\n", + " <td>101041.000000</td>\n", + " <td>985.000000</td>\n", + " <td>-76.400000</td>\n", + " <td>-527.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155605e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.714444e+07</td>\n", + " <td>1.950704e+06</td>\n", + " <td>59599.000000</td>\n", + " <td>9887.000000</td>\n", + " <td>138579.000000</td>\n", + " <td>1950.000000</td>\n", + " <td>-72.900000</td>\n", + " <td>-350.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>34.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " 
<td>44.000000</td>\n", + " <td>748000.000000</td>\n", + " <td>3.834786e+07</td>\n", + " <td>3.154128e+06</td>\n", + " <td>97761.750000</td>\n", + " <td>12660.000000</td>\n", + " <td>168403.000000</td>\n", + " <td>3154.000000</td>\n", + " <td>-68.400000</td>\n", + " <td>-226.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-41.000000</td>\n", + " <td>42.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.908956e+07</td>\n", + " <td>3.895821e+06</td>\n", + " <td>189705.000000</td>\n", + " <td>15444.000000</td>\n", + " <td>189705.000000</td>\n", + " <td>3895.000000</td>\n", + " <td>-41.000000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 317762.000000 317762.000000 3.177620e+05 3.177620e+05 \n", + "mean -75.526452 22.776858 2.170711e+06 5.902836e+12 \n", + "std 5.583369 12.801744 7.820521e+04 7.406653e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 12.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155605e+06 5.902806e+12 \n", + "75% -71.900000 34.000000 2.226340e+06 5.902852e+12 \n", + "max -41.000000 42.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 317762.000000 317762.000000 317762.000000 317762.000000 \n", + "mean 82.639686 82.639686 18.142610 18.142610 \n", + "std 141.056329 141.056329 8.830015 8.830015 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 
25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 317762.000000 317762.000000 ... 317762.000000 317762.000000 \n", + "mean 2.603486 2.603486 ... 29.378906 499539.922961 \n", + "std 0.932835 0.932835 ... 17.290876 288325.166786 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 249000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 748000.000000 \n", + "max 6.000000 6.000000 ... 59.000000 999000.000000 \n", + "\n", + " MILISEC TIME_MS TIME_PER_MEASUREMENT_MS NUMBER_OF_SIGNALS \\\n", + "count 3.177620e+05 3.177620e+05 317762.000000 317762.000000 \n", + "mean 3.717748e+07 1.983745e+06 66756.283605 10103.277673 \n", + "std 1.183041e+06 1.183041e+06 44987.211264 3113.779828 \n", + "min 3.519374e+07 0.000000e+00 0.000000 4597.000000 \n", + "25% 3.617897e+07 9.852318e+05 29587.000000 8027.000000 \n", + "50% 3.714444e+07 1.950704e+06 59599.000000 9887.000000 \n", + "75% 3.834786e+07 3.154128e+06 97761.750000 12660.000000 \n", + "max 3.908956e+07 3.895821e+06 189705.000000 15444.000000 \n", + "\n", + " LENGTH_OF_MEASUREMENT TIME_KMS MAX_PROXIMITY_KMS \\\n", + "count 317762.000000 317762.000000 317762.000000 \n", + "mean 133222.336198 1983.244840 -72.536124 \n", + "std 39296.957914 1183.041729 5.831107 \n", + "min 53538.000000 0.000000 -98.000000 \n", + "25% 101041.000000 985.000000 -76.400000 \n", + "50% 138579.000000 1950.000000 -72.900000 \n", + "75% 168403.000000 3154.000000 -68.400000 \n", + "max 189705.000000 3895.000000 -41.000000 \n", + "\n", + " SUM_PROXIMITY_KMS \n", + "count 317762.000000 \n", + "mean -404.190290 \n", + "std 267.914649 \n", + "min -2629.400000 \n", + "25% -527.300000 \n", + "50% -350.700000 \n", + "75% -226.100000 \n", + "max -52.300000 \n", + "\n", + "[8 rows x 24 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 7, 6, 10, 11, 12, 13, 14, 16, 18, 17, 19, 20,\n", + " 21, 23, 24, 26, 27, 28, 29, 31, 32, 34, 35, 36, 38, 37, 40, 39, 41,\n", + " 42], dtype=int64)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.MEASUREMENT.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 , RK485-99X\n", + "2 , RK485-99X\n", + "3 , RK485-99X\n", + "4 , RK485-99X\n", + "5 , RK485-99X\n", + "7 , RK485-99X\n", + "6 , RK485-99X\n", + "10 , RK485-99X\n", + "11 , RK485-99X\n", + "12 , RK485-99X\n", + "13 , RK485-99X\n", + "14 , RK485-99X\n", + "16 , RK485-99X\n", + "18 , RK485-99X\n", + "17 , RK485-99X\n", + "19 , RK485-99X\n", + "20 , RK485-99X\n", + "21 , RK485-99X\n", + "23 , RK485-99X\n", + "24 , RK485-99X\n", + "26 , RK485-99X\n", + "27 , RK485-99X\n", + "28 , RK485-99X\n", + "29 , RK485-99X\n", + "31 , RK485-99X\n", + "32 , RK485-99X\n", + "34 , RK485-99X\n", + "35 , RK485-99X\n", + "36 , RK485-99X\n", + "38 , RK485-99X\n", + "37 , RK485-99X\n", + "40 , RK485-99X\n", + "39 , RK485-99X\n", + "41 , RK485-99X\n", + "42 , RK485-99X\n" + ] + } + ], + "source": [ + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.StyleColor.unique():\n", + " zbior = zb[zb['StyleColor'] == j]\n", + " if zbior.EPC.unique().size == 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Zatem usuwamy caĹy ten Stylokolor\n", + "df.drop(df.loc[df['StyleColor'] == 'RK485-99X'].index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": 
[ + "# Sprawdzamy, czy mamy klipsy przypisane do więcej niż 1 Itemu\n", + "for i in df.MEASUREMENT.unique():\n", + " zb = df[df['MEASUREMENT'] == i]\n", + " for j in zb.EPC.unique():\n", + " zbior = zb[zb['EPC'] == j]\n", + " if zbior.EAN.unique().size > 1:\n", + " print(i,', ', j)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>EPC</th>\n", + " <th>PROXIMITY</th>\n", + " <th>TIMESTAMP</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>StyleColorSize</th>\n", + " <th>StyleColor</th>\n", + " <th>Size</th>\n", + " <th>SubclassID</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "<p>0 rows × 36 columns</p>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [EPC, PROXIMITY, TIMESTAMP, MEASUREMENT, ITEMID, EAN, StyleColorSize, StyleColor, Size, SubclassID, ItemSeason, FashionLevel, SubclassID.1, SubclassName, ClassID, ClassID.1, ClassName, DepartmentID, DepartmentID.1, DepartmentName, BrandID, BrandID.1, BrandName, Active, HOUR, MIN, SEC, MICROSEC, MILISEC, TIME_MS, TIME_PER_MEASUREMENT_MS, NUMBER_OF_SIGNALS, LENGTH_OF_MEASUREMENT, TIME_KMS, 
MAX_PROXIMITY_KMS, SUM_PROXIMITY_KMS]\n", + "Index: []\n", + "\n", + "[0 rows x 36 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#do testow samego modelu\n", + "test_1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ((df.MEASUREMENT == 2) | (df.MEASUREMENT == 3) | (df.MEASUREMENT == 4) | (df.MEASUREMENT == 26) | (df.MEASUREMENT == 28) ) ]\n", + "\n", + "test1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 1) ) ]\n", + "test2 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 2) ) ]\n", + "test3 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 3) ) ]\n", + "test4 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ( (df.MEASUREMENT == 4) ) ]\n", + "test5 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 5) ]\n", + "test6 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 6) ]\n", + "test9 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 9) ]\n", + "test12 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 12) ]\n", + "test22 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 22) ]\n", + "test24 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 24) ]\n", + "test25 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 25) ]\n", + "test21 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 21) ]\n", + "test29 = df[(df.TIMESTAMP <= 
'2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 29) ]\n", + "test28 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & (df.MEASUREMENT == 28) ]\n", + "\n", + "test29" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test6)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 3])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 4])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", 
hue='EPC', data=test_1[test_1.MEASUREMENT == 26])\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10,8))\n", + "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 28])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Następnie tworzymy intuicyjny podział na zbiór testowy i treningowy: pomiary przed godziną 10.30 traktujemy jako zbiór treningowy, natomiast te po godzinie 10.30 - jako zbiór testowy." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "train = df[df.TIMESTAMP <= '2021-10-26T10:30:00.000']\n", + "train = train[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]\n", + "test = df[df.TIMESTAMP > '2021-10-26T10:30:00.000']\n", + "test = test[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn.cluster import DBSCAN\n", + "from sklearn import metrics\n", + "from sklearn.datasets import make_blobs\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.preprocessing import normalize\n", + "from sklearn.neighbors import NearestNeighbors\n", + "import plotly.express as px\n", + "from kneed import KneeLocator" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#zbiór testowy dla kilku stylokolorów\n", + "test0 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & ((df.StyleColor == 'RH797-81X') | (df.StyleColor == 'SL171-99X') \n", + 
" | (df.StyleColor == 'RH797-59X'))]\n", + "caly1 = test0[['SUM_PROXIMITY_KMS','TIME_MS', 'EAN', 'MEASUREMENT','StyleColor', 'EPC']]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def c1(x, g, d, c):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(2,minimal_epc.astype(int))\n", + "\n", + " if minimal_epc > g:\n", + " minimal_epc = g\n", + " neighbors = g\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(6000,kneedle.knee_y/d)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > c*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == 
b].EAN.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "#zbiór treningowy dla kilku stylokolorów \n", + "train1 = train[(train.StyleColor == 'RH267-85J') | (train.StyleColor == 'RJ369-87X') | (train.StyleColor =='RM119-93X') \n", + " | (train.StyleColor == 'RS483-99X') | (train.StyleColor == 'SB281-90M')]\n", + "\n", + "train2 = train[(train.StyleColor == 'RV167-MLC') | (train.StyleColor == 'RV462-87X') | (train.StyleColor =='QJ677-33X') \n", + " | (train.StyleColor == 'RH797-00X') | (train.StyleColor == 'RH267-55J')]\n", + "\n", + "train3 = train[(train.StyleColor == 'SL171-99X') | (train.StyleColor == 'SO133-09M') | (train.StyleColor =='RB254-00X') \n", + " | (train.StyleColor == 'SF078-MLC') | (train.StyleColor == 'QY337-00X')]\n", + "\n", + "train4 = train[(train.StyleColor == 'SP095-59X') | (train.StyleColor == 'RN633-00X') | (train.StyleColor =='RH267-59J') \n", + " | (train.StyleColor == 'RV167-87X')]\n", + "\n", + "train5 = train[(train.StyleColor == 'RJ365-09M') | (train.StyleColor == 'RH797-59X') | (train.StyleColor =='SP090-90X') \n", + " | (train.StyleColor == 'RH797-99X') | (train.StyleColor == 'RJ371-59M')]\n", + "\n", + "train6 = train[(train.StyleColor == 'RV462-99X') | (train.StyleColor == 'RH797-81X') | (train.StyleColor =='QZ555-20X') \n", + " | (train.StyleColor == 'RJ371-53M') | (train.StyleColor == 'RS054-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Wybrane StyleColor\n", + "z1 = test[(test.StyleColor == 'RH267-85J') | (test.StyleColor == 'RJ369-87X') | (test.StyleColor =='RM119-93X') \n", + " | (test.StyleColor == 'RS483-99X') | (test.StyleColor == 'SB281-90M')]\n", + "\n", + "z2 = test[(test.StyleColor == 'RV167-MLC') | (test.StyleColor == 'RV462-87X') | (test.StyleColor =='QJ677-33X') \n", + " | 
(test.StyleColor == 'RH797-00X') | (test.StyleColor == 'RH267-55J')]\n", + "\n", + "z3 = test[(test.StyleColor == 'SL171-99X') | (test.StyleColor == 'SO133-09M') | (test.StyleColor =='RB254-00X') \n", + " | (test.StyleColor == 'SF078-MLC') | (test.StyleColor == 'QY337-00X')]\n", + "\n", + "z4 = test[(test.StyleColor == 'SP095-59X') | (test.StyleColor == 'RN633-00X') | (test.StyleColor =='RH267-59J') \n", + " | (test.StyleColor == 'RV167-87X') | (test.StyleColor == 'RK485-99X')]\n", + "\n", + "z5 = test[(test.StyleColor == 'RJ365-09M') | (test.StyleColor == 'RH797-59X') | (test.StyleColor =='SP090-90X') \n", + " | (test.StyleColor == 'RH797-99X') | (test.StyleColor == 'RJ371-59M')]\n", + "\n", + "z6 = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='QZ555-20X') \n", + " | (test.StyleColor == 'RJ371-53M') | (test.StyleColor == 'RS054-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "u = train[(train.StyleColor == 'RV462-99X') | (train.StyleColor == 'RH797-81X') | (train.StyleColor =='SL171-99X')]\n", + "t = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='SL171-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", 
+ " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(u)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + 
"text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + 
"Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train2)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", 
+ "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train4)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train5)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " 
<td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 21 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 68, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train,70,6,0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr 
style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " 
<td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 42 5902805533040 RH267-85J\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851445700 RS483-99X\n", + "3 35 5902690542769 QY337-00X\n", + "4 42 5902690542769 QY337-00X\n", + "5 42 5902690542745 QY337-00X\n", + "6 42 5902690542769 QY337-00X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 38 5902975236994 SF078-MLC\n", + "12 42 5902975236956 SF078-MLC\n", + "13 38 5902851852614 SO133-09M\n", + "14 38 5902851852638 SO133-09M\n", + "15 38 5902851852638 SO133-09M\n", + "16 42 5902851852638 SO133-09M\n", + "17 42 5902851852614 SO133-09M\n", + "18 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test,70,6,0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr 
style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 25 5902805820447 RH797-81X\n", + "2 24 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 30 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1,0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " 
</tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 21 5902805820447 RH797-81X\n", + "2 25 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 26 5902805820447 RH797-81X\n", + "5 30 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 225, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1, 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z1)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " 
def c2(x, *, min_neighbors=20, max_neighbors=70, eps_floor=5000,
       knee_divisor=5, outlier_share=0.3):
    """Report tags whose readings DBSCAN mostly labels as noise.

    For every (StyleColor, MEASUREMENT) pair in ``x`` the readings are
    clustered with DBSCAN on the two columns ``SUM_PROXIMITY_KMS`` and
    ``TIME_MS``.  ``eps`` is taken from the knee of the sorted k-distance
    curve (divided by ``knee_divisor``) but never below ``eps_floor``.  A tag
    (EPC) is reported when more than ``outlier_share`` of its readings in
    that measurement end up with the noise label ``-1``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.
    min_neighbors : int, keyword-only
        Lower bound for the ``k`` used in the k-distance curve.
    max_neighbors : int, keyword-only
        Value used for both ``k`` and ``min_samples`` when half of the
        rarest tag's reading count exceeds it.
    eps_floor : float, keyword-only
        Smallest ``eps`` ever handed to DBSCAN.
    knee_divisor : float, keyword-only
        The knee distance is divided by this before comparing to
        ``eps_floor``.
    outlier_share : float, keyword-only
        Fraction of a tag's readings that must be noise to report the tag.

    Returns
    -------
    pandas.DataFrame
        Columns ``MEASUREMENT``, ``EAN``, ``StyleColor``; one row per
        reported tag (the EAN is taken from the tag's first reading).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per call.
    rows = []

    for style in x.StyleColor.unique():
        style_df = x[x.StyleColor == style]

        for measurement in style_df.MEASUREMENT.unique():
            sample = style_df[style_df.MEASUREMENT == measurement]
            reads_per_epc = sample['EPC'].value_counts()
            if reads_per_epc.empty:
                # No usable readings (e.g. NaN keys) - nothing to cluster.
                continue

            # Half of the rarest tag's reading count drives k and min_samples.
            half_min_reads = int(reads_per_epc.min() // 2)
            if half_min_reads > max_neighbors:
                min_samples = neighbors = max_neighbors
            else:
                min_samples = half_min_reads
                neighbors = max(min_neighbors, half_min_reads)
            # DBSCAN needs an integer min_samples >= 1 (1 behaves like 0 would:
            # every point is its own core point).
            min_samples = max(1, int(min_samples))
            # kneighbors() raises when k exceeds the number of samples.
            neighbors = min(int(neighbors), len(sample))

            features = sample[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            knn = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = knn.kneighbors(features)
            k_distances = sorted(distances[:, neighbors - 1], reverse=True)

            knee = KneeLocator(range(1, len(k_distances) + 1),  # x values
                               k_distances,                     # y values
                               S=1.0,
                               curve="convex",
                               direction="decreasing")
            # knee_y is None when no knee is found; fall back to the floor.
            if knee.knee_y is None:
                eps = eps_floor
            else:
                eps = max(eps_floor, knee.knee_y / knee_divisor)

            # Single fit; the original fitted the same model twice.
            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='auto').fit_predict(features)

            noise_mask = labels == -1
            if not noise_mask.any():
                continue

            noise_per_epc = sample.loc[noise_mask, 'EPC'].value_counts()
            for epc, noise_reads in noise_per_epc.items():
                if noise_reads > outlier_share * reads_per_epc.loc[epc]:
                    ean = sample.loc[sample['EPC'] == epc, 'EAN'].iloc[0]
                    rows.append({'MEASUREMENT': measurement,
                                 'EAN': ean,
                                 'StyleColor': style})

    return pd.DataFrame(rows, columns=columns)
<td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 24 5902805820461 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 27 5902805820447 RH797-81X\n", + "7 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " 
<td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902805820447 RH797-81X" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " 
# best model
def c6(x, *, min_neighbors=10, max_neighbors=70, eps_floor=2000,
       knee_divisor=3, outlier_share=0.4):
    """Report tags (EPCs) whose readings DBSCAN mostly labels as noise.

    For every (StyleColor, MEASUREMENT) pair in ``x`` the readings are
    clustered with DBSCAN on the two columns ``SUM_PROXIMITY_KMS`` and
    ``TIME_MS``.  ``eps`` is taken from the knee of the sorted k-distance
    curve (divided by ``knee_divisor``) but never below ``eps_floor``.  A tag
    is reported when more than ``outlier_share`` of its readings in that
    measurement end up with the noise label ``-1``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.
    min_neighbors : int, keyword-only
        Lower bound for the ``k`` used in the k-distance curve.
    max_neighbors : int, keyword-only
        Value used for both ``k`` and ``min_samples`` when half of the
        rarest tag's reading count exceeds it.
    eps_floor : float, keyword-only
        Smallest ``eps`` ever handed to DBSCAN.
    knee_divisor : float, keyword-only
        The knee distance is divided by this before comparing to
        ``eps_floor``.
    outlier_share : float, keyword-only
        Fraction of a tag's readings that must be noise to report the tag.

    Returns
    -------
    pandas.DataFrame
        Columns ``MEASUREMENT``, ``EPC``, ``StyleColor``; one row per
        reported tag.
    """
    columns = ['MEASUREMENT', 'EPC', 'StyleColor']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per call.
    rows = []

    for style in x.StyleColor.unique():
        style_df = x[x.StyleColor == style]

        for measurement in style_df.MEASUREMENT.unique():
            sample = style_df[style_df.MEASUREMENT == measurement]
            reads_per_epc = sample['EPC'].value_counts()
            if reads_per_epc.empty:
                # No usable readings (e.g. NaN keys) - nothing to cluster.
                continue

            # Half of the rarest tag's reading count drives k and min_samples.
            half_min_reads = int(reads_per_epc.min() // 2)
            if half_min_reads > max_neighbors:
                min_samples = neighbors = max_neighbors
            else:
                min_samples = half_min_reads
                neighbors = max(min_neighbors, half_min_reads)
            # DBSCAN needs an integer min_samples >= 1 (1 behaves like 0 would:
            # every point is its own core point).
            min_samples = max(1, int(min_samples))
            # kneighbors() raises when k exceeds the number of samples.
            neighbors = min(int(neighbors), len(sample))

            features = sample[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            knn = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = knn.kneighbors(features)
            k_distances = sorted(distances[:, neighbors - 1], reverse=True)

            knee = KneeLocator(range(1, len(k_distances) + 1),  # x values
                               k_distances,                     # y values
                               S=1.0,
                               curve="convex",
                               direction="decreasing")
            # knee_y is None when no knee is found; fall back to the floor.
            if knee.knee_y is None:
                eps = eps_floor
            else:
                eps = max(eps_floor, knee.knee_y / knee_divisor)

            # Single fit; the original fitted the same model twice.
            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='auto').fit_predict(features)

            noise_mask = labels == -1
            if not noise_mask.any():
                continue

            noise_per_epc = sample.loc[noise_mask, 'EPC'].value_counts()
            for epc, noise_reads in noise_per_epc.items():
                if noise_reads > outlier_share * reads_per_epc.loc[epc]:
                    rows.append({'MEASUREMENT': measurement,
                                 'EPC': epc,
                                 'StyleColor': style})

    return pd.DataFrame(rows, columns=columns)
+ { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th></th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " <th>TIME_MS</th>\n", + " <th>EAN</th>\n", + " <th>MEASUREMENT</th>\n", + " </tr>\n", + " <tr>\n", + " <th>StyleColor</th>\n", + " <th>EPC</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th rowspan=\"5\" valign=\"top\">RH797-81X</th>\n", + " <th>3035684754501F0000B5B614</th>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F0000B5B632</th>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F4000B5B6E5</th>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F8000B5B6A5</th>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F8000B5B6E6</th>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"4\" valign=\"top\">SF078-MLC</th>\n", + " <th>30356849FC1723C000B5B1A3</th>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30356849FC17244000B59A90</th>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>30356849FC1724C000B599A7</th>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30356849FC1724C000B59A42</th>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"4\" valign=\"top\">SL171-99X</th>\n", + " <th>303568480C287A8000B5BA63</th>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287A8000B5BABD</th>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287A8000B5BADA</th>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287AC000B5BAD5</th>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " SUM_PROXIMITY_KMS TIME_MS EAN \\\n", + "StyleColor EPC \n", + "RH797-81X 3035684754501F0000B5B614 1699 1699 1699 \n", + " 3035684754501F0000B5B632 1393 1393 1393 \n", + " 3035684754501F4000B5B6E5 1646 1646 1646 \n", + " 3035684754501F8000B5B6A5 1329 1329 1329 \n", + " 3035684754501F8000B5B6E6 1625 1625 1625 \n", + "SF078-MLC 30356849FC1723C000B5B1A3 934 934 934 \n", + " 30356849FC17244000B59A90 1187 1187 1187 \n", + " 30356849FC1724C000B599A7 1108 1108 1108 \n", + " 30356849FC1724C000B59A42 1179 1179 1179 \n", + "SL171-99X 303568480C287A8000B5BA63 2191 2191 2191 \n", + " 303568480C287A8000B5BABD 785 785 785 \n", + " 303568480C287A8000B5BADA 1241 1241 1241 \n", + " 303568480C287AC000B5BAD5 1152 1152 1152 \n", + "\n", + " MEASUREMENT \n", + "StyleColor EPC \n", + "RH797-81X 3035684754501F0000B5B614 1699 \n", + " 3035684754501F0000B5B632 1393 \n", + " 3035684754501F4000B5B6E5 1646 \n", + " 3035684754501F8000B5B6A5 1329 \n", + " 
3035684754501F8000B5B6E6 1625 \n", + "SF078-MLC 30356849FC1723C000B5B1A3 934 \n", + " 30356849FC17244000B59A90 1187 \n", + " 30356849FC1724C000B599A7 1108 \n", + " 30356849FC1724C000B59A42 1179 \n", + "SL171-99X 303568480C287A8000B5BA63 2191 \n", + " 303568480C287A8000B5BABD 785 \n", + " 303568480C287A8000B5BADA 1241 \n", + " 303568480C287AC000B5BAD5 1152 " + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train7.groupby(['StyleColor','EPC']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", 
+ " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 27 303568480C287AC000B5BAD5 SL171-99X\n", + "1 28 30356849FC1724C000B59A42 SF078-MLC\n", + "2 21 3035684754501F0000B5B614 RH797-81X\n", + "3 24 3035684754501F0000B5B614 RH797-81X\n", + "4 27 3035684754501F0000B5B614 RH797-81X\n", + "5 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + 
" <th>6</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 32 303568480C287AC000B5BAD5 SL171-99X\n", + "1 35 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 41 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287A8000B5BABD SL171-99X\n", + "9 29 3035684754501F0000B5B614 RH797-81X\n", + "10 32 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 23 5902975236994 SF078-MLC\n", + "1 24 5902975236994 SF078-MLC\n", + "2 21 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 24 5902805820447 RH797-81X\n", + "6 24 5902805820461 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>29</td>\n", + " 
<td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>40</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 32 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 38 5902851414508 SL171-99X\n", + "4 38 5902851414508 SL171-99X\n", + "5 42 5902851414515 SL171-99X\n", + "6 42 5902851414508 SL171-99X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 29 5902975236994 SF078-MLC\n", + "12 36 5902975236994 SF078-MLC\n", + "13 38 5902975236994 SF078-MLC\n", + "14 38 5902975236956 SF078-MLC\n", + "15 40 5902975236994 SF078-MLC\n", + "16 41 5902975236994 SF078-MLC\n", + "17 42 5902975236956 SF078-MLC" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>12</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>23</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>28</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>7</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>17</td>\n", + " <td>303568480C3455C000B5B30A</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>17</td>\n", + " <td>30356847542CCD0000B59A80</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD8000B599FA</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD0000B59A26</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>4</td>\n", + " <td>3035684754340CC000B594C3</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>18</td>\n", + " <td>3035684754340D0000B594EB</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>12</th>\n", + " <td>18</td>\n", + " <td>3035684754340CC000B594C6</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>7</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>17</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>19</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>19</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>7</td>\n", + " <td>303568475415740000B5A5CD</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " <td>303568475434134000B5B6DF</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D4000B5B656</td>\n", + " 
<td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D0000B5B65A</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>12</td>\n", + " <td>30356849FC1E348000B5B2D4</td>\n", + " <td>SP090-90X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 12 303568480C2B874000B59A39 RS483-99X\n", + "1 23 303568480C2B874000B59A39 RS483-99X\n", + "2 28 303568480C2B868000B599B2 RS483-99X\n", + "3 1 303568480C357A0000B59999 SB281-90M\n", + "4 16 303568480C357A0000B59999 SB281-90M\n", + "5 7 30356847541DA80000B5BA54 RJ369-87X\n", + "6 17 303568480C3455C000B5B30A RV167-MLC\n", + "7 17 30356847542CCD0000B59A80 QJ677-33X\n", + "8 26 30356847542CCD8000B599FA QJ677-33X\n", + "9 26 30356847542CCD0000B59A26 QJ677-33X\n", + "10 4 3035684754340CC000B594C3 RH267-55J\n", + "11 18 3035684754340D0000B594EB RH267-55J\n", + "12 18 3035684754340CC000B594C6 RH267-55J\n", + "13 27 303568480C287AC000B5BAD5 SL171-99X\n", + "14 7 303568480C5343C000B599F6 SO133-09M\n", + "15 17 303568480C5343C000B599C8 SO133-09M\n", + "16 17 303568480C53434000B599E1 SO133-09M\n", + "17 24 303568480C53434000B599E1 SO133-09M\n", + "18 24 30356847540FE2C000B59A68 RB254-00X\n", + "19 28 30356849FC1724C000B59A42 SF078-MLC\n", + "20 11 303568458835008000B5BAD1 QY337-00X\n", + "21 11 303568458835010000B5BA58 QY337-00X\n", + 
"22 19 303568458835008000B5BAD1 QY337-00X\n", + "23 19 303568458835010000B5BA58 QY337-00X\n", + "24 7 303568475415740000B5A5CD RN633-00X\n", + "25 1 303568475434134000B5B6DF RH267-59J\n", + "26 2 30356847542B6D4000B5B656 RJ365-09M\n", + "27 2 30356847542B6D0000B5B65A RJ365-09M\n", + "28 12 30356849FC1E348000B5B2D4 SP090-90X\n", + "29 21 3035684754501F0000B5B614 RH797-81X\n", + "30 24 3035684754501F0000B5B614 RH797-81X\n", + "31 27 3035684754501F0000B5B614 RH797-81X\n", + "32 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", 
+ " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>303568480C34548000B5B2B5</td>\n", + " <td>RV167-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>32</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>303568475415744000B599FE</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>39</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " 
<td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>35</td>\n", + " <td>303568475450218000B59781</td>\n", + " <td>RH797-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 35 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287AC000B5BAD5 SL171-99X\n", + "4 38 303568480C287A8000B5BA63 SL171-99X\n", + "5 38 303568480C287A8000B5BADA SL171-99X\n", + "6 38 303568480C287A8000B5BABD SL171-99X\n", + "7 41 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287AC000B5BAD5 SL171-99X\n", + "9 42 303568480C287A8000B5BABD SL171-99X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + "11 38 303568480C34548000B5B2B5 RV167-87X\n", + "12 42 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835008000B5BAD1 QY337-00X\n", + "14 32 30356847541DA80000B5BA54 RJ369-87X\n", + "15 29 3035684754501F0000B5B614 RH797-81X\n", + "16 32 3035684754501F0000B5B614 RH797-81X\n", + "17 29 303568475415744000B599FE RN633-00X\n", + "18 39 303568480C5343C000B599F6 SO133-09M\n", + "19 42 303568480C5343C000B599F6 SO133-09M\n", + "20 42 303568480C53434000B599E1 SO133-09M\n", + "21 42 303568480C5343C000B599C8 SO133-09M\n", + "22 35 303568475450218000B59781 RH797-00X" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DBSCAN_1-zmiany_df-Copy1.ipynb b/DBSCAN_1-zmiany_df-Copy1.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c73e7893e86dd37c1a2996f67821d22991bfcc2a --- /dev/null +++ b/DBSCAN_1-zmiany_df-Copy1.ipynb @@ -0,0 +1,7725 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pyodbc\n", + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import urllib\n", + "import seaborn as sns\n", + "from matplotlib import pyplot as plt\n", + "import numpy as np\n", + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "params = urllib.parse.quote_plus(\"DRIVER={ODBC Driver 17 for SQL Server};\"\n", + " #\"SERVER=dbserver.mif.pg.gda.pl,1433;\"\n", + " \"SERVER=127.0.0.1,1433;\"\n", + " \"DATABASE=silkycoders;\"\n", + " \"UID=;\"\n", + " \"PWD=\")\n", + "\n", + "engine = create_engine(\"mssql+pyodbc:///?odbc_connect={}\".format(params))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"SELECT rfid.*, it.*, sub.*, cl.*, dep.*, br.*\n", + " FROM rfid.Logs rfid \n", + " JOIN rfid.EanEpc ean \n", + " ON rfid.EPC = ean.EPC \n", + " JOIN dw.Item it \n", + " ON ean.EAN = it.EAN \n", + " JOIN dw.Subclass sub \n", + " ON sub.SubclassID = it.SubclassID\n", + " JOIN dw.Class cl\n", + " ON sub.ClassID = cl.ClassID\n", + " JOIN dw.Department dep\n", + " ON dep.DepartmentID = cl.DepartmentID\n", + " JOIN dw.Brand br\n", + " ON dep.BrandID = br.BrandID\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql_query(query, engine)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_timestamp(df : pd.DataFrame):\n", + " \n", + " dt = df.sort_values(by=\"TIMESTAMP\").reset_index(drop=True)\n", + " dt[\"HOUR\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.hour.astype(int)\n", + " dt[\"MIN\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.minute.astype(int)\n", + " dt[\"SEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.second.astype(int)\n", + " dt[\"MICROSEC\"] = dt.TIMESTAMP.astype('datetime64[ns]').dt.microsecond.astype(int)\n", + "\n", + " dt[\"MILISEC\"] = dt.MICROSEC/1000 + dt.SEC*1000 + dt.MIN*60000 + dt.HOUR*3600000\n", + " dt[\"TIME_MS\"] = dt.MILISEC - dt.MILISEC[0]\n", + " \n", + " dt['TIME_PER_MEASUREMENT_MS'] = 0\n", + " dt['NUMBER_OF_SIGNALS'] = 0\n", + " dt['LENGTH_OF_MEASUREMENT'] = 0\n", + " \n", + " for m in dt.MEASUREMENT.unique():\n", + " filtr = (dt.MEASUREMENT == m)\n", + " dt.loc[filtr,'TIME_PER_MEASUREMENT_MS'] = dt[filtr].MILISEC - dt[filtr].MILISEC.iloc[0]\n", + " dt.loc[filtr, \"NUMBER_OF_SIGNALS\"] = len(dt[filtr])\n", + " dt.loc[filtr, 'LENGTH_OF_MEASUREMENT'] = dt[filtr].TIME_PER_MEASUREMENT_MS.max()\n", + " \n", + " dt[\"TIME_KMS\"] = np.floor(dt.TIME_MS/1000) \n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].max().reset_index(name=\"MAX_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " dt = dt.merge(dt.groupby(['EPC','TIME_KMS'])[\"PROXIMITY\"].sum().reset_index(name=\"SUM_PROXIMITY_KMS\"), how=\"left\",\n", + " on = ['EPC','TIME_KMS'])\n", + " return dt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = prepare_timestamp(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('df.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 502689 entries, 0 to 502688\n", + "Data columns (total 36 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 EPC 502689 non-null object \n", + " 1 PROXIMITY 502689 non-null float64\n", + " 2 TIMESTAMP 502689 non-null object \n", + " 3 MEASUREMENT 502689 non-null int64 \n", + " 4 ITEMID 502689 non-null int64 \n", + " 5 EAN 502689 non-null int64 \n", + " 6 StyleColorSize 502689 non-null object \n", + " 7 StyleColor 502689 non-null object \n", + " 8 Size 502689 non-null object \n", + " 9 SubclassID 502689 non-null int64 \n", + " 10 ItemSeason 502689 non-null object \n", + " 11 FashionLevel 369997 non-null object \n", + " 12 SubclassID.1 502689 non-null int64 \n", + " 13 SubclassName 502689 non-null object \n", + " 14 ClassID 502689 non-null int64 \n", + " 15 ClassID.1 502689 non-null int64 \n", + " 16 ClassName 502689 non-null object \n", + " 17 DepartmentID 502689 non-null int64 \n", + " 18 DepartmentID.1 502689 non-null int64 \n", + " 19 DepartmentName 502689 non-null object \n", + " 20 BrandID 502689 non-null int64 \n", + " 21 BrandID.1 502689 non-null int64 \n", + " 22 BrandName 502689 non-null object \n", + " 23 Active 502689 non-null bool \n", + " 24 HOUR 502689 non-null int64 \n", + " 25 MIN 502689 non-null int64 \n", + " 26 SEC 502689 non-null int64 \n", + " 27 MICROSEC 502689 non-null int64 \n", + " 28 MILISEC 502689 non-null float64\n", + " 29 TIME_MS 502689 non-null float64\n", + " 30 TIME_PER_MEASUREMENT_MS 502689 non-null float64\n", + " 31 NUMBER_OF_SIGNALS 502689 non-null int64 \n", + " 32 LENGTH_OF_MEASUREMENT 502689 non-null int64 \n", + " 33 TIME_KMS 502689 non-null float64\n", + " 34 MAX_PROXIMITY_KMS 502689 non-null float64\n", + " 35 SUM_PROXIMITY_KMS 502689 non-null float64\n", + "dtypes: bool(1), float64(7), int64(17), object(11)\n", + "memory usage: 134.7+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>PROXIMITY</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>SubclassID</th>\n", + " <th>SubclassID.1</th>\n", + " <th>ClassID</th>\n", + " <th>ClassID.1</th>\n", + " <th>DepartmentID</th>\n", + " <th>DepartmentID.1</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>...</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>5.026890e+05</td>\n", + " <td>5.026890e+05</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.00000</td>\n", + " <td>502689.000000</td>\n", + " <td>502689.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>-75.406017</td>\n", + " <td>23.412382</td>\n", + " <td>2.169862e+06</td>\n", + " 
<td>5.902835e+12</td>\n", + " <td>83.920704</td>\n", + " <td>83.920704</td>\n", + " <td>18.231477</td>\n", + " <td>18.231477</td>\n", + " <td>2.609574</td>\n", + " <td>2.609574</td>\n", + " <td>...</td>\n", + " <td>29.193547</td>\n", + " <td>499773.110213</td>\n", + " <td>3.721192e+07</td>\n", + " <td>2.018186e+06</td>\n", + " <td>100132.210719</td>\n", + " <td>15383.906986</td>\n", + " <td>199835.398777</td>\n", + " <td>2017.68607</td>\n", + " <td>-72.497318</td>\n", + " <td>-398.108291</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.698062</td>\n", + " <td>12.175284</td>\n", + " <td>7.798483e+04</td>\n", + " <td>7.380986e+07</td>\n", + " <td>142.489244</td>\n", + " <td>142.489244</td>\n", + " <td>8.844056</td>\n", + " <td>8.844056</td>\n", + " <td>0.937828</td>\n", + " <td>0.937828</td>\n", + " <td>...</td>\n", + " <td>17.223297</td>\n", + " <td>288469.414710</td>\n", + " <td>1.121487e+06</td>\n", + " <td>1.121487e+06</td>\n", + " <td>81859.831696</td>\n", + " <td>8217.121271</td>\n", + " <td>101049.072703</td>\n", + " <td>1121.48684</td>\n", + " <td>5.893956</td>\n", + " <td>262.167663</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.00000</td>\n", + " <td>-100.500000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>13.000000</td>\n", + " <td>2.113407e+06</td>\n", + " <td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " 
<td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>250000.000000</td>\n", + " <td>3.624898e+07</td>\n", + " <td>1.055248e+06</td>\n", + " <td>38108.000000</td>\n", + " <td>8533.000000</td>\n", + " <td>127122.000000</td>\n", + " <td>1055.00000</td>\n", + " <td>-76.400000</td>\n", + " <td>-515.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155604e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.719645e+07</td>\n", + " <td>2.002711e+06</td>\n", + " <td>78477.000000</td>\n", + " <td>13321.000000</td>\n", + " <td>176026.000000</td>\n", + " <td>2002.00000</td>\n", + " <td>-72.900000</td>\n", + " <td>-342.900000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>33.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>749000.000000</td>\n", + " <td>3.815973e+07</td>\n", + " <td>2.965991e+06</td>\n", + " <td>139431.000000</td>\n", + " <td>22217.000000</td>\n", + " <td>265127.000000</td>\n", + " <td>2965.00000</td>\n", + " <td>-68.400000</td>\n", + " <td>-225.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-38.900000</td>\n", + " <td>43.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " <td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " 
<td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.912875e+07</td>\n", + " <td>3.935013e+06</td>\n", + " <td>435771.000000</td>\n", + " <td>35350.000000</td>\n", + " <td>435771.000000</td>\n", + " <td>3935.00000</td>\n", + " <td>-38.900000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 502689.000000 502689.000000 5.026890e+05 5.026890e+05 \n", + "mean -75.406017 23.412382 2.169862e+06 5.902835e+12 \n", + "std 5.698062 12.175284 7.798483e+04 7.380986e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 13.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155604e+06 5.902806e+12 \n", + "75% -71.900000 33.000000 2.226340e+06 5.902852e+12 \n", + "max -38.900000 43.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 502689.000000 502689.000000 502689.000000 502689.000000 \n", + "mean 83.920704 83.920704 18.231477 18.231477 \n", + "std 142.489244 142.489244 8.844056 8.844056 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 502689.000000 502689.000000 ... 502689.000000 502689.000000 \n", + "mean 2.609574 2.609574 ... 29.193547 499773.110213 \n", + "std 0.937828 0.937828 ... 17.223297 288469.414710 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 250000.000000 \n", + "50% 2.000000 2.000000 ... 29.000000 500000.000000 \n", + "75% 3.000000 3.000000 ... 44.000000 749000.000000 \n", + "max 6.000000 6.000000 ... 
# Summary statistics of the prepared frame (displayed by the notebook).
df.describe()

# Distribution of total measurement duration; the trailing semicolon keeps
# the notebook from echoing the matplotlib return value.
plt.boxplot(df['LENGTH_OF_MEASUREMENT']);

# Remove the longest measurements: every row belonging to a measurement
# whose total length exceeds 200000 is dropped from df in place.
df.drop(index=df.index[df['LENGTH_OF_MEASUREMENT'] > 200000], inplace=True)
<td>-75.526452</td>\n", + " <td>22.776858</td>\n", + " <td>2.170711e+06</td>\n", + " <td>5.902836e+12</td>\n", + " <td>82.639686</td>\n", + " <td>82.639686</td>\n", + " <td>18.142610</td>\n", + " <td>18.142610</td>\n", + " <td>2.603486</td>\n", + " <td>2.603486</td>\n", + " <td>...</td>\n", + " <td>29.378906</td>\n", + " <td>499539.922961</td>\n", + " <td>3.717748e+07</td>\n", + " <td>1.983745e+06</td>\n", + " <td>66756.283605</td>\n", + " <td>10103.277673</td>\n", + " <td>133222.336198</td>\n", + " <td>1983.244840</td>\n", + " <td>-72.536124</td>\n", + " <td>-404.190290</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>5.583369</td>\n", + " <td>12.801744</td>\n", + " <td>7.820521e+04</td>\n", + " <td>7.406653e+07</td>\n", + " <td>141.056329</td>\n", + " <td>141.056329</td>\n", + " <td>8.830015</td>\n", + " <td>8.830015</td>\n", + " <td>0.932835</td>\n", + " <td>0.932835</td>\n", + " <td>...</td>\n", + " <td>17.290876</td>\n", + " <td>288325.166786</td>\n", + " <td>1.183041e+06</td>\n", + " <td>1.183041e+06</td>\n", + " <td>44987.211264</td>\n", + " <td>3113.779828</td>\n", + " <td>39296.957914</td>\n", + " <td>1183.041729</td>\n", + " <td>5.831107</td>\n", + " <td>267.914649</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>-110.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.028742e+06</td>\n", + " <td>5.902691e+12</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>3.519374e+07</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000</td>\n", + " <td>4597.000000</td>\n", + " <td>53538.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-98.000000</td>\n", + " <td>-2629.400000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>-79.900000</td>\n", + " <td>12.000000</td>\n", + " <td>2.113407e+06</td>\n", + " 
<td>5.902805e+12</td>\n", + " <td>11.000000</td>\n", + " <td>11.000000</td>\n", + " <td>10.000000</td>\n", + " <td>10.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>14.000000</td>\n", + " <td>249000.000000</td>\n", + " <td>3.617897e+07</td>\n", + " <td>9.852318e+05</td>\n", + " <td>29587.000000</td>\n", + " <td>8027.000000</td>\n", + " <td>101041.000000</td>\n", + " <td>985.000000</td>\n", + " <td>-76.400000</td>\n", + " <td>-527.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>-75.700000</td>\n", + " <td>24.000000</td>\n", + " <td>2.155605e+06</td>\n", + " <td>5.902806e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>16.000000</td>\n", + " <td>16.000000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>...</td>\n", + " <td>29.000000</td>\n", + " <td>500000.000000</td>\n", + " <td>3.714444e+07</td>\n", + " <td>1.950704e+06</td>\n", + " <td>59599.000000</td>\n", + " <td>9887.000000</td>\n", + " <td>138579.000000</td>\n", + " <td>1950.000000</td>\n", + " <td>-72.900000</td>\n", + " <td>-350.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>-71.900000</td>\n", + " <td>34.000000</td>\n", + " <td>2.226340e+06</td>\n", + " <td>5.902852e+12</td>\n", + " <td>82.000000</td>\n", + " <td>82.000000</td>\n", + " <td>25.000000</td>\n", + " <td>25.000000</td>\n", + " <td>3.000000</td>\n", + " <td>3.000000</td>\n", + " <td>...</td>\n", + " <td>44.000000</td>\n", + " <td>748000.000000</td>\n", + " <td>3.834786e+07</td>\n", + " <td>3.154128e+06</td>\n", + " <td>97761.750000</td>\n", + " <td>12660.000000</td>\n", + " <td>168403.000000</td>\n", + " <td>3154.000000</td>\n", + " <td>-68.400000</td>\n", + " <td>-226.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>-41.000000</td>\n", + " <td>42.000000</td>\n", + " <td>2.304122e+06</td>\n", + " <td>5.902975e+12</td>\n", + " <td>630.000000</td>\n", + " <td>630.000000</td>\n", + " 
<td>41.000000</td>\n", + " <td>41.000000</td>\n", + " <td>6.000000</td>\n", + " <td>6.000000</td>\n", + " <td>...</td>\n", + " <td>59.000000</td>\n", + " <td>999000.000000</td>\n", + " <td>3.908956e+07</td>\n", + " <td>3.895821e+06</td>\n", + " <td>189705.000000</td>\n", + " <td>15444.000000</td>\n", + " <td>189705.000000</td>\n", + " <td>3895.000000</td>\n", + " <td>-41.000000</td>\n", + " <td>-52.300000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8 rows Ă 24 columns</p>\n", + "</div>" + ], + "text/plain": [ + " PROXIMITY MEASUREMENT ITEMID EAN \\\n", + "count 317762.000000 317762.000000 3.177620e+05 3.177620e+05 \n", + "mean -75.526452 22.776858 2.170711e+06 5.902836e+12 \n", + "std 5.583369 12.801744 7.820521e+04 7.406653e+07 \n", + "min -110.000000 1.000000 2.028742e+06 5.902691e+12 \n", + "25% -79.900000 12.000000 2.113407e+06 5.902805e+12 \n", + "50% -75.700000 24.000000 2.155605e+06 5.902806e+12 \n", + "75% -71.900000 34.000000 2.226340e+06 5.902852e+12 \n", + "max -41.000000 42.000000 2.304122e+06 5.902975e+12 \n", + "\n", + " SubclassID SubclassID.1 ClassID ClassID.1 \\\n", + "count 317762.000000 317762.000000 317762.000000 317762.000000 \n", + "mean 82.639686 82.639686 18.142610 18.142610 \n", + "std 141.056329 141.056329 8.830015 8.830015 \n", + "min 10.000000 10.000000 10.000000 10.000000 \n", + "25% 11.000000 11.000000 10.000000 10.000000 \n", + "50% 82.000000 82.000000 16.000000 16.000000 \n", + "75% 82.000000 82.000000 25.000000 25.000000 \n", + "max 630.000000 630.000000 41.000000 41.000000 \n", + "\n", + " DepartmentID DepartmentID.1 ... SEC MICROSEC \\\n", + "count 317762.000000 317762.000000 ... 317762.000000 317762.000000 \n", + "mean 2.603486 2.603486 ... 29.378906 499539.922961 \n", + "std 0.932835 0.932835 ... 17.290876 288325.166786 \n", + "min 2.000000 2.000000 ... 0.000000 0.000000 \n", + "25% 2.000000 2.000000 ... 14.000000 249000.000000 \n", + "50% 2.000000 2.000000 ... 
# Summary statistics after the long measurements were removed.
df.describe()

# Measurement ids that are still present.
df.MEASUREMENT.unique()

# List every (measurement, StyleColor) pair that is represented by exactly
# one distinct tag (EPC) in that measurement.
for measurement in df.MEASUREMENT.unique():
    in_measurement = df[df['MEASUREMENT'] == measurement]
    for style_color in in_measurement.StyleColor.unique():
        epcs = in_measurement.loc[in_measurement['StyleColor'] == style_color, 'EPC']
        if len(epcs.unique()) == 1:
            print(measurement, ', ', style_color)

# Therefore we remove this whole StyleColor.
df.drop(index=df.index[df['StyleColor'] == 'RK485-99X'], inplace=True)

# Check whether any clip (EPC) is assigned to more than one item (EAN)
# within a single measurement.
for measurement in df.MEASUREMENT.unique():
    in_measurement = df[df['MEASUREMENT'] == measurement]
    for epc in in_measurement.EPC.unique():
        eans = in_measurement.loc[in_measurement['EPC'] == epc, 'EAN']
        if len(eans.unique()) > 1:
            print(measurement, ', ', epc)
# Subsets used for testing the model itself: StyleColor RH797-81X, signals
# with a TIMESTAMP up to the 10:30 cut-off, one frame per measurement.
_rh797_before_cutoff = (
    (df.TIMESTAMP <= '2021-10-26T10:30:00.000')
    & (df.StyleColor == 'RH797-81X')
)

# Several measurements combined into one frame.
test_1 = df[_rh797_before_cutoff & df.MEASUREMENT.isin([2, 3, 4, 26, 28])]

# One frame per single measurement.
test1 = df[_rh797_before_cutoff & (df.MEASUREMENT == 1)]
test2 = df[_rh797_before_cutoff & (df.MEASUREMENT == 2)]
test3 = df[_rh797_before_cutoff & (df.MEASUREMENT == 3)]
test4 = df[_rh797_before_cutoff & (df.MEASUREMENT == 4)]
test5 = df[_rh797_before_cutoff & (df.MEASUREMENT == 5)]
test6 = df[_rh797_before_cutoff & (df.MEASUREMENT == 6)]
test9 = df[_rh797_before_cutoff & (df.MEASUREMENT == 9)]
test12 = df[_rh797_before_cutoff & (df.MEASUREMENT == 12)]
test22 = df[_rh797_before_cutoff & (df.MEASUREMENT == 22)]
test24 = df[_rh797_before_cutoff & (df.MEASUREMENT == 24)]
test25 = df[_rh797_before_cutoff & (df.MEASUREMENT == 25)]
test21 = df[_rh797_before_cutoff & (df.MEASUREMENT == 21)]
test29 = df[_rh797_before_cutoff & (df.MEASUREMENT == 29)]
test28 = df[_rh797_before_cutoff & (df.MEASUREMENT == 28)]

test29

# Scatter plots of summed proximity over time, one figure per frame; the
# colouring column (EAN or EPC) is chosen per figure.
for _frame, _hue in (
    (test6, 'EAN'),
    (test_1[test_1.MEASUREMENT == 3], 'EAN'),
    (test_1[test_1.MEASUREMENT == 4], 'EPC'),
    (test_1[test_1.MEASUREMENT == 26], 'EPC'),
):
    plt.figure(figsize=(10,8))
    sns.scatterplot(x="TIME_MS", y="SUM_PROXIMITY_KMS", hue=_hue, data=_frame)
    plt.show()
def c1(x, g, d, c):
    """Report tags whose signals DBSCAN mostly labels as noise.

    For every StyleColor in ``x`` and, within it, every MEASUREMENT, the
    points ``(SUM_PROXIMITY_KMS, TIME_MS)`` are clustered with DBSCAN.
    ``eps`` is taken from the knee of the descending k-nearest-neighbour
    distance curve (``KneeLocator``), divided by ``d`` and never smaller
    than 6000.  A tag (EPC) is reported when the number of its signals in
    cluster ``-1`` exceeds ``c`` times the number of its signals in that
    measurement.

    Args:
        x: DataFrame with the columns SUM_PROXIMITY_KMS, TIME_MS, EAN,
            MEASUREMENT, StyleColor and EPC.
        g: Upper cap applied to both the neighbour count and DBSCAN's
            ``min_samples``.
        d: Divisor applied to the knee distance before it is used as eps.
        c: Multiplier on a tag's signal count; the tag is flagged when its
            noise count is strictly greater than ``c * count``.

    Returns:
        DataFrame with the columns MEASUREMENT, EAN, StyleColor, one row
        per flagged tag, in the order the tags were found.
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    flagged = []

    for style_color in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style_color]

        for measurement in style_rows.MEASUREMENT.unique():
            proba = style_rows[style_rows.MEASUREMENT == measurement]

            # Number of signals per tag in this measurement (NaN EPCs are
            # not counted by value_counts).
            epc_counts = proba['EPC'].value_counts()
            if epc_counts.empty:
                continue

            # Half of the rarest tag's signal count, as a plain int.
            min_samples = int(epc_counts.min() // 2)
            neighbors = max(2, min_samples)
            if min_samples > g:
                min_samples = g
                neighbors = g
            neighbors = int(neighbors)

            # kneighbors() cannot return more neighbours than there are
            # points; such a group cannot be clustered, so skip it instead
            # of raising.
            if len(proba) < neighbors:
                continue

            points = proba[['SUM_PROXIMITY_KMS', 'TIME_MS']].to_numpy()

            # Distance of every point to its `neighbors`-th nearest point,
            # sorted descending, forms the curve whose knee suggests eps.
            nbrs = NearestNeighbors(n_neighbors=neighbors).fit(points)
            distances, _ = nbrs.kneighbors(points)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,  # y values
                                  S=1.0,  # parameter suggested from paper
                                  curve="convex",  # parameter from figure
                                  direction="decreasing")  # parameter from figure

            # knee_y is None when no knee is detected; fall back to the
            # 6000 floor instead of failing on None / d.
            knee_distance = kneedle.knee_y
            eps = 6000 if knee_distance is None else max(6000, knee_distance / d)

            # min_samples must be a positive integer for DBSCAN; 0 and 1
            # behave the same (every point counts itself), so clamping to 1
            # keeps the clustering unchanged.  The model is fitted once.
            labels = DBSCAN(eps=eps,
                            min_samples=max(1, int(min_samples)),
                            algorithm='auto').fit_predict(points)

            # Signals per tag that ended up in the noise cluster (-1).
            noise_counts = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_counts.items():
                if noise_count > c * epc_counts.loc[epc]:
                    flagged.append({
                        'MEASUREMENT': measurement,
                        'EAN': proba.loc[proba['EPC'] == epc, 'EAN'].iloc[0],
                        'StyleColor': style_color,
                    })

    return pd.DataFrame(flagged, columns=columns)
# Training-period rows split into groups of StyleColors.
train2 = train[train.StyleColor.isin(
    ['RV167-MLC', 'RV462-87X', 'QJ677-33X', 'RH797-00X', 'RH267-55J'])]

train3 = train[train.StyleColor.isin(
    ['SL171-99X', 'SO133-09M', 'RB254-00X', 'SF078-MLC', 'QY337-00X'])]

train4 = train[train.StyleColor.isin(
    ['SP095-59X', 'RN633-00X', 'RH267-59J', 'RV167-87X'])]

train5 = train[train.StyleColor.isin(
    ['RJ365-09M', 'RH797-59X', 'SP090-90X', 'RH797-99X', 'RJ371-59M'])]

train6 = train[train.StyleColor.isin(
    ['RV462-99X', 'RH797-81X', 'QZ555-20X', 'RJ371-53M', 'RS054-99X'])]

# Selected StyleColors, taken from the test-period rows.
z1 = test[test.StyleColor.isin(
    ['RH267-85J', 'RJ369-87X', 'RM119-93X', 'RS483-99X', 'SB281-90M'])]

z2 = test[test.StyleColor.isin(
    ['RV167-MLC', 'RV462-87X', 'QJ677-33X', 'RH797-00X', 'RH267-55J'])]

z3 = test[test.StyleColor.isin(
    ['SL171-99X', 'SO133-09M', 'RB254-00X', 'SF078-MLC', 'QY337-00X'])]

# Unlike train4, this group also lists RK485-99X.
z4 = test[test.StyleColor.isin(
    ['SP095-59X', 'RN633-00X', 'RH267-59J', 'RV167-87X', 'RK485-99X'])]

z5 = test[test.StyleColor.isin(
    ['RJ365-09M', 'RH797-59X', 'SP090-90X', 'RH797-99X', 'RJ371-59M'])]

z6 = test[test.StyleColor.isin(
    ['RV462-99X', 'RH797-81X', 'QZ555-20X', 'RJ371-53M', 'RS054-99X'])]

# The same three StyleColors, once from the training rows and once from the
# test rows.
u = train[train.StyleColor.isin(['RV462-99X', 'RH797-81X', 'SL171-99X'])]
t = test[test.StyleColor.isin(['RV462-99X', 'RH797-81X', 'SL171-99X'])]
StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(u)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr 
th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train2)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train3)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "c1(train4)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train5)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": 
[ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 21 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train,70,6,0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " 
</tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 42 5902805533040 RH267-85J\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851445700 RS483-99X\n", + "3 35 5902690542769 QY337-00X\n", + "4 42 5902690542769 QY337-00X\n", + "5 42 5902690542745 QY337-00X\n", + "6 42 5902690542769 QY337-00X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 38 5902975236994 SF078-MLC\n", + "12 42 5902975236956 SF078-MLC\n", + "13 38 5902851852614 SO133-09M\n", + "14 38 5902851852638 SO133-09M\n", + "15 38 5902851852638 SO133-09M\n", + "16 42 5902851852638 SO133-09M\n", + "17 42 5902851852614 SO133-09M\n", + "18 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test,70,6,0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>3</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 25 5902805820447 RH797-81X\n", + "2 24 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 30 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 268, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1,0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>25</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + 
" <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 22 5902805820447 RH797-81X\n", + "1 21 5902805820447 RH797-81X\n", + "2 25 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 26 5902805820447 RH797-81X\n", + "5 30 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 225, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(caly1, 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + "DF = df[(df.StyleColor == 'SL171-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "proba = DF[ DF.MEASUREMENT == 28 ]\n", + "X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + "minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "303568480C287AC000B5BAD5 96\n", + "303568480C287A8000B5BA63 90\n", + "303568480C287A8000B5BADA 54\n", + "303568480C287A8000B5BABD 14\n", + "Name: EPC, dtype: int64" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proba['EPC'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "minimal_epc" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "eps = np.floor(proba['EPC'].value_counts().max()*20)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + "db.fit(X)\n", + "y_pred = db.fit_predict(X)\n", + "clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + "calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "if (db.labels_[db.labels_ == -1].size != 0) :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.6*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': 24, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", + " 'StyleColor':'QY337-00X'}, ignore_index = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "303568480C287AC000B5BAD5 7\n", + "303568480C287A8000B5BA63 5\n", + "303568480C287A8000B5BADA 4\n", + "Name: EPC, dtype: int64" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + 
" <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outliery" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(z1)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " 
<tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " 
}\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 277, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /5\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 278, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " 
<td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 278, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 280, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 24 5902805820447 RH797-81X\n", + "1 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 280, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dla /6\n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 281, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style 
scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 281, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 283, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 283, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# eps min 5000 / 5\n", + "# \n", + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 284, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 29 5902805820447 RH797-81X\n", + "3 32 5902805820447 RH797-81X\n", + "4 34 5902805820447 RH797-81X\n", + "5 38 5902851852614 SO133-09M\n", + "6 38 5902851852638 SO133-09M\n", + "7 42 5902851852638 SO133-09M" + ] + }, + "execution_count": 284, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# c=0.6\n", + "c1(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " 
<td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 24 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 317, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " 
def c2(x, *, min_neighbors=20, knee_divisor=5, noise_share=0.3,
       min_eps=5000, max_min_samples=70, algorithm='auto'):
    """Flag EPC tags whose readings are mostly DBSCAN noise.

    For every (StyleColor, MEASUREMENT) group of ``x`` the readings are
    clustered with DBSCAN on the two columns ``TIME_MS`` and
    ``SUM_PROXIMITY_KMS``.  ``eps`` is derived from the knee of the sorted
    k-nearest-neighbour distance curve.  An EPC is reported when more than
    ``noise_share`` of its readings in the group are labelled noise (-1).

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``TIME_MS`` and ``SUM_PROXIMITY_KMS``.
    min_neighbors : int
        Lower bound for the ``k`` used in the k-distance curve.
    knee_divisor : float
        The knee distance is divided by this value to obtain ``eps``.
    noise_share : float
        Fraction of an EPC's readings that must be noise to flag it.
    min_eps : float
        Lower bound for ``eps``; also used when no knee is found.
    max_min_samples : int
        Upper bound for DBSCAN ``min_samples`` (and for ``k`` when reached).
    algorithm : str
        Passed through to ``DBSCAN(algorithm=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per flagged EPC with columns ``MEASUREMENT``, ``EAN`` and
        ``StyleColor``.  Several EPCs sharing one EAN yield repeated rows.

    Notes
    -----
    ``NearestNeighbors``, ``DBSCAN`` and ``KneeLocator`` are expected to be
    imported by an earlier notebook cell (not visible here -- TODO confirm).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    flagged = []

    # sort=False keeps first-appearance order, like iterating .unique().
    for style_color, style_rows in x.groupby('StyleColor', sort=False):
        for measurement, proba in style_rows.groupby('MEASUREMENT', sort=False):
            readings_per_epc = proba['EPC'].value_counts()
            if readings_per_epc.empty:
                continue  # no usable EPC values in this group

            # Half of the smallest per-EPC reading count, as a plain int.
            half_min = int(readings_per_epc.min() // 2)
            if half_min > max_min_samples:
                half_min = max_min_samples
                neighbors = max_min_samples
            else:
                neighbors = max(min_neighbors, half_min)

            # DBSCAN requires an integer min_samples >= 1; 0 and 1 behave
            # the same because every point counts itself as a neighbour.
            min_samples = max(1, half_min)
            # kneighbors() raises if k exceeds the number of samples.
            neighbors = min(neighbors, len(proba))

            # Euclidean distances do not depend on column order, so one
            # feature matrix serves both the k-distance curve and DBSCAN.
            features = proba[['TIME_MS', 'SUM_PROXIMITY_KMS']].to_numpy()

            knn = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = knn.kneighbors(features)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,                     # y values
                                  S=1.0,
                                  curve="convex",
                                  direction="decreasing")

            # knee_y is None when no knee is detected; fall back to min_eps.
            knee_y = kneedle.knee_y
            eps = min_eps if knee_y is None else max(min_eps, knee_y / knee_divisor)

            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm=algorithm).fit_predict(features)

            # Noise readings per EPC, most frequent first (empty if no noise).
            noise_per_epc = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                if noise_count > noise_share * readings_per_epc.loc[epc]:
                    ean = proba.loc[proba['EPC'] == epc, 'EAN'].iloc[0]
                    flagged.append({'MEASUREMENT': measurement,
                                    'EAN': ean,
                                    'StyleColor': style_color})

    return pd.DataFrame(flagged, columns=columns)
+ " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 24 5902805820461 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 27 5902805820447 RH797-81X\n", + "7 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train6)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902805820447 RH797-81X" + ] + }, + "execution_count": 71, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(z6)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" + ] + }, + 
"execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " 
def c3(x, *, min_neighbors=2, knee_divisor=4, noise_share=0.5,
       min_eps=5000, max_min_samples=70, algorithm='kd_tree'):
    """Flag EPC tags whose readings are mostly DBSCAN noise.

    For every (StyleColor, MEASUREMENT) group of ``x`` the readings are
    clustered with DBSCAN on the two columns ``TIME_MS`` and
    ``SUM_PROXIMITY_KMS``.  ``eps`` is derived from the knee of the sorted
    k-nearest-neighbour distance curve.  An EPC is reported when more than
    ``noise_share`` of its readings in the group are labelled noise (-1).

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``TIME_MS`` and ``SUM_PROXIMITY_KMS``.
    min_neighbors : int
        Lower bound for the ``k`` used in the k-distance curve.
    knee_divisor : float
        The knee distance is divided by this value to obtain ``eps``.
    noise_share : float
        Fraction of an EPC's readings that must be noise to flag it.
    min_eps : float
        Lower bound for ``eps``; also used when no knee is found.
    max_min_samples : int
        Upper bound for DBSCAN ``min_samples`` (and for ``k`` when reached).
    algorithm : str
        Passed through to ``DBSCAN(algorithm=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per flagged EPC with columns ``MEASUREMENT``, ``EAN`` and
        ``StyleColor``.  Several EPCs sharing one EAN yield repeated rows.

    Notes
    -----
    ``NearestNeighbors``, ``DBSCAN`` and ``KneeLocator`` are expected to be
    imported by an earlier notebook cell (not visible here -- TODO confirm).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    flagged = []

    # sort=False keeps first-appearance order, like iterating .unique().
    for style_color, style_rows in x.groupby('StyleColor', sort=False):
        for measurement, proba in style_rows.groupby('MEASUREMENT', sort=False):
            readings_per_epc = proba['EPC'].value_counts()
            if readings_per_epc.empty:
                continue  # no usable EPC values in this group

            # Half of the smallest per-EPC reading count, as a plain int.
            half_min = int(readings_per_epc.min() // 2)
            if half_min > max_min_samples:
                half_min = max_min_samples
                neighbors = max_min_samples
            else:
                neighbors = max(min_neighbors, half_min)

            # DBSCAN requires an integer min_samples >= 1; 0 and 1 behave
            # the same because every point counts itself as a neighbour.
            min_samples = max(1, half_min)
            # kneighbors() raises if k exceeds the number of samples.
            neighbors = min(neighbors, len(proba))

            # Euclidean distances do not depend on column order, so one
            # feature matrix serves both the k-distance curve and DBSCAN.
            features = proba[['TIME_MS', 'SUM_PROXIMITY_KMS']].to_numpy()

            knn = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = knn.kneighbors(features)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,                     # y values
                                  S=1.0,
                                  curve="convex",
                                  direction="decreasing")

            # knee_y is None when no knee is detected; fall back to min_eps.
            knee_y = kneedle.knee_y
            eps = min_eps if knee_y is None else max(min_eps, knee_y / knee_divisor)

            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm=algorithm).fit_predict(features)

            # Noise readings per EPC, most frequent first (empty if no noise).
            noise_per_epc = proba.loc[labels == -1, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                if noise_count > noise_share * readings_per_epc.loc[epc]:
                    ean = proba.loc[proba['EPC'] == epc, 'EAN'].iloc[0]
                    flagged.append({'MEASUREMENT': measurement,
                                    'EAN': ean,
                                    'StyleColor': style_color})

    return pd.DataFrame(flagged, columns=columns)
<td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c3(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " 
def c4(x, *, min_neighbors=2, knee_divisor=2, dominant_share=0.5,
       min_eps=5000, max_min_samples=70, algorithm='auto'):
    """Flag, per measurement, the single EPC that dominates the DBSCAN noise.

    For every (StyleColor, MEASUREMENT) group of ``x`` the readings are
    clustered with DBSCAN on the two columns ``TIME_MS`` and
    ``SUM_PROXIMITY_KMS``.  ``eps`` is derived from the knee of the sorted
    k-nearest-neighbour distance curve.  When the group has more noise
    readings (label -1) than ``min_samples`` and the EPC with the most noise
    readings accounts for more than ``dominant_share`` of them, that EPC's
    EAN is reported.  At most one row is produced per group.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``TIME_MS`` and ``SUM_PROXIMITY_KMS``.
    min_neighbors : int
        Lower bound for the ``k`` used in the k-distance curve.
    knee_divisor : float
        The knee distance is divided by this value to obtain ``eps``.
    dominant_share : float
        Fraction of all noise readings the top EPC must exceed.
    min_eps : float
        Lower bound for ``eps``; also used when no knee is found.
    max_min_samples : int
        Upper bound for DBSCAN ``min_samples`` (and for ``k`` when reached).
    algorithm : str
        Passed through to ``DBSCAN(algorithm=...)``.

    Returns
    -------
    pandas.DataFrame
        Rows with columns ``MEASUREMENT``, ``EAN`` and ``StyleColor``.

    Notes
    -----
    ``NearestNeighbors``, ``DBSCAN`` and ``KneeLocator`` are expected to be
    imported by an earlier notebook cell (not visible here -- TODO confirm).
    """
    columns = ['MEASUREMENT', 'EAN', 'StyleColor']
    # Rows are collected in a list: DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    flagged = []

    # sort=False keeps first-appearance order, like iterating .unique().
    for style_color, style_rows in x.groupby('StyleColor', sort=False):
        for measurement, proba in style_rows.groupby('MEASUREMENT', sort=False):
            readings_per_epc = proba['EPC'].value_counts()
            if readings_per_epc.empty:
                continue  # no usable EPC values in this group

            # Half of the smallest per-EPC reading count, as a plain int.
            half_min = int(readings_per_epc.min() // 2)
            if half_min > max_min_samples:
                half_min = max_min_samples
                neighbors = max_min_samples
            else:
                neighbors = max(min_neighbors, half_min)

            # DBSCAN requires an integer min_samples >= 1; 0 and 1 behave
            # the same because every point counts itself as a neighbour.
            min_samples = max(1, half_min)
            # kneighbors() raises if k exceeds the number of samples.
            neighbors = min(neighbors, len(proba))

            # Euclidean distances do not depend on column order, so one
            # feature matrix serves both the k-distance curve and DBSCAN.
            features = proba[['TIME_MS', 'SUM_PROXIMITY_KMS']].to_numpy()

            knn = NearestNeighbors(n_neighbors=neighbors).fit(features)
            distances, _ = knn.kneighbors(features)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,                     # y values
                                  S=1.0,
                                  curve="convex",
                                  direction="decreasing")

            # knee_y is None when no knee is detected; fall back to min_eps.
            knee_y = kneedle.knee_y
            eps = min_eps if knee_y is None else max(min_eps, knee_y / knee_divisor)

            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm=algorithm).fit_predict(features)

            noise_mask = labels == -1
            # half_min >= 0, so this test also covers the "no noise" case.
            if int(noise_mask.sum()) <= half_min:
                continue

            # Noise readings per EPC, most frequent first.
            noise_per_epc = proba.loc[noise_mask, 'EPC'].value_counts()
            if noise_per_epc.empty:
                continue

            # .iloc / .index[0] pick the top entry by position; plain [0]
            # on an EPC-labelled Series is a deprecated positional fallback.
            top_epc = noise_per_epc.index[0]
            top_count = noise_per_epc.iloc[0]
            if top_count > dominant_share * noise_per_epc.sum():
                ean = proba.loc[proba['EPC'] == top_epc, 'EAN'].iloc[0]
                flagged.append({'MEASUREMENT': measurement,
                                'EAN': ean,
                                'StyleColor': style_color})

    return pd.DataFrame(flagged, columns=columns)
</tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>7</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>11</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>1</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>5</td>\n", + " <td>5902805431797</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>6</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>12</td>\n", + " <td>5902851535913</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>2</td>\n", + " <td>5902975217986</td>\n", + " <td>RV462-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>4</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>7</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>12</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>4</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>7</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>14</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>17</th>\n", + " <td>20</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>2</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>4</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>13</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>16</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>23</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>24</td>\n", + " <td>5902805219685</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>1</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>2</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>13</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>21</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>24</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>26</td>\n", + " <td>5902805444698</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>23</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>33</th>\n", + " <td>3</td>\n", + " <td>5902805385823</td>\n", + " <td>RJ371-53M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 6 5902805533040 RH267-85J\n", + "1 24 5902805533040 RH267-85J\n", + "2 4 5902851445731 RS483-99X\n", + "3 7 5902851445700 RS483-99X\n", + "4 11 5902851445731 RS483-99X\n", + "5 28 5902805303681 RJ369-87X\n", + "6 1 5902805431803 RM119-93X\n", + "7 5 5902805431797 RM119-93X\n", + "8 6 5902805431803 RM119-93X\n", + "9 12 5902851535913 RV167-MLC\n", + "10 2 5902975217986 RV462-87X\n", + "11 4 5902851414508 SL171-99X\n", + "12 7 5902851414508 SL171-99X\n", + "13 12 5902851414508 SL171-99X\n", + "14 4 5902851852638 SO133-09M\n", + "15 7 5902851852638 SO133-09M\n", + "16 14 5902851852638 SO133-09M\n", + "17 20 5902851852638 SO133-09M\n", + "18 2 5902690542769 QY337-00X\n", + "19 4 5902690542745 QY337-00X\n", + "20 13 5902690542769 QY337-00X\n", + "21 16 5902690542745 QY337-00X\n", + "22 23 5902690542745 QY337-00X\n", + "23 24 5902805219685 RN633-00X\n", + "24 1 5902805533255 RH267-59J\n", + "25 2 5902805533255 RH267-59J\n", + "26 13 5902805533255 RH267-59J\n", + "27 21 5902805533255 RH267-59J\n", + "28 24 5902805533255 RH267-59J\n", + "29 26 5902805444698 RJ365-09M\n", + "30 21 5902805820447 RH797-81X\n", + "31 23 5902805820447 RH797-81X\n", + "32 28 5902805820447 RH797-81X\n", + "33 3 5902805385823 RJ371-53M" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c4(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + 
"<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>41</td>\n", + " <td>5902805532999</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>34</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>32</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>40</td>\n", + " <td>5902805431803</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>32</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>37</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>40</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>32</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>36</td>\n", + " <td>5902805820423</td>\n", + " <td>RH797-59X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>37</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", 
+ " <th>14</th>\n", + " <td>32</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>40</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>39</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>32</td>\n", + " <td>5902805820546</td>\n", + " <td>RH797-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 41 5902805532999 RH267-55J\n", + "1 32 5902851414515 SL171-99X\n", + "2 34 5902851414515 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 32 5902805431803 RM119-93X\n", + "5 40 5902805431803 RM119-93X\n", + "6 32 5902851445700 RS483-99X\n", + "7 37 5902851445700 RS483-99X\n", + "8 40 5902851445731 RS483-99X\n", + "9 32 5902690542745 QY337-00X\n", + "10 36 5902805820423 RH797-59X\n", + "11 37 5902805303681 RJ369-87X\n", + "12 29 5902805820447 RH797-81X\n", + "13 32 5902805820447 RH797-81X\n", + "14 32 5902805533255 RH267-59J\n", + "15 38 5902805533255 RH267-59J\n", + "16 40 5902805533255 RH267-59J\n", + "17 29 5902975236994 SF078-MLC\n", + "18 36 5902975236994 SF078-MLC\n", + "19 41 5902975236994 SF078-MLC\n", + "20 39 5902851852638 SO133-09M\n", + "21 32 5902805820546 RH797-00X" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ 
def c5(x):
    """Flag EPC tags whose reads are mostly labelled as noise by DBSCAN.

    For every (StyleColor, MEASUREMENT) pair in ``x`` the reads are clustered
    on the two features ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.  An EPC is
    reported when more than half of its reads in that pair receive the DBSCAN
    noise label (-1).

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``EAN``, ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.

    Returns
    -------
    pandas.DataFrame
        One row per flagged EPC with the columns ``MEASUREMENT``, ``EAN`` and
        ``StyleColor``, in the order the pairs are visited.  Empty (but with
        the same columns) when nothing is flagged.
    """
    flagged = []  # collected as dicts; DataFrame.append no longer exists in pandas 2.x

    for j in x.StyleColor.unique():
        DF = x[x.StyleColor == j]

        for i in DF.MEASUREMENT.unique():
            proba = DF[DF.MEASUREMENT == i]
            X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

            reads_per_epc = proba['EPC'].value_counts()
            # min_samples must be an integer >= 1; half of the rarest EPC's
            # read count can round down to 0, which is equivalent to 1 because
            # every point counts itself as a neighbour.
            min_samples = max(1, int(reads_per_epc.min() // 2))
            # Neighbourhood radius scales with the most frequently read EPC.
            eps = float(reads_per_epc.max() * 30)

            # A single fit is enough: fit_predict returns the same labels_.
            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='kd_tree').fit_predict(X)

            noise = labels == -1
            if not noise.any():
                continue

            noisy_reads = proba.loc[noise, 'EPC'].value_counts()
            for epc, n_noisy in noisy_reads.items():
                epc_rows = proba[proba['EPC'] == epc]
                # count() is the per-column non-null count; the first column
                # serves as the EPC's total number of reads.
                if n_noisy > 0.5 * epc_rows.count().iloc[0]:
                    flagged.append({'MEASUREMENT': i,
                                    'EAN': epc_rows.EAN.iloc[0],
                                    'StyleColor': j})

    return pd.DataFrame(flagged, columns=['MEASUREMENT', 'EAN', 'StyleColor'])
" <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 27 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(u)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " 
<td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 35 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 29 5902805820447 RH797-81X\n", + "5 32 5902805820447 RH797-81X\n", + "6 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>13</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>13</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " 
<td>26</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>16</td>\n", + " <td>5902851547602</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>6</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>6</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>7</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>11</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>11</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>11</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>12</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>13</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>13</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>17</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>24</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " 
<td>12</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>28</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>19</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>19</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>5</td>\n", + " <td>5902805533255</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 13 5902805533040 RH267-85J\n", + "1 13 5902805533040 RH267-85J\n", + "2 26 5902851445700 RS483-99X\n", + "3 16 5902851547602 SB281-90M\n", + "4 6 5902851852638 SO133-09M\n", + "5 6 
5902851852614 SO133-09M\n", + "6 7 5902851852638 SO133-09M\n", + "7 11 5902851852614 SO133-09M\n", + "8 11 5902851852638 SO133-09M\n", + "9 11 5902851852638 SO133-09M\n", + "10 12 5902851852638 SO133-09M\n", + "11 13 5902851852638 SO133-09M\n", + "12 13 5902851852614 SO133-09M\n", + "13 18 5902851852638 SO133-09M\n", + "14 17 5902851852638 SO133-09M\n", + "15 24 5902851852638 SO133-09M\n", + "16 4 5902975236994 SF078-MLC\n", + "17 4 5902975236994 SF078-MLC\n", + "18 12 5902975236956 SF078-MLC\n", + "19 23 5902975236994 SF078-MLC\n", + "20 28 5902975236994 SF078-MLC\n", + "21 11 5902690542745 QY337-00X\n", + "22 11 5902690542769 QY337-00X\n", + "23 19 5902690542745 QY337-00X\n", + "24 19 5902690542769 QY337-00X\n", + "25 1 5902805533255 RH267-59J\n", + "26 5 5902805533255 RH267-59J\n", + "27 21 5902805820447 RH797-81X\n", + "28 24 5902805820447 RH797-81X\n", + "29 26 5902805820447 RH797-81X\n", + "30 27 5902805820447 RH797-81X\n", + "31 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bez if i 0,3\n", + "c5(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", 
+ " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>29</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902851445731</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>35</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>32</td>\n", + " <td>5902805303681</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " 
c5(test)

# --- Single-case experiment: one StyleColor / MEASUREMENT pair --------------
outliery1 = pd.DataFrame(columns=['MEASUREMENT', 'EAN', 'StyleColor'])
j = 'RH797-81X'  # other value tried: 'RH267-55J'
i = 24

DF = train[train.StyleColor == j]
proba = DF[DF.MEASUREMENT == i]
X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)
# DBSCAN requires an integer min_samples >= 1 (np.floor returned a float and
# could yield 0 for an EPC read only once).
minimal_epc = max(1, int(proba['EPC'].value_counts().min() // 2))

eps = 6000

db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')
# One fit only: fit_predict both fits the model and fills db.labels_.
y_pred = db.fit_predict(X)
clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
calosc = pd.concat([proba, clusters], axis=1)

if (db.labels_ == -1).any():
    # Number of noise-labelled reads per EPC.
    a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
    new_rows = []
    for b in a.index:
        epc_reads = proba[proba['EPC'] == b]
        # Flag the EPC when more than half of its reads are noise.
        if a[b] > 0.5 * epc_reads.count().iloc[0]:
            new_rows.append({'MEASUREMENT': i,
                             'EAN': epc_reads.EAN.iloc[0],
                             'StyleColor': j})
    if new_rows:
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        outliery1 = pd.concat([outliery1, pd.DataFrame(new_rows)],
                              ignore_index=True)

outliery1

# --- Sweep over every MEASUREMENT / StyleColor pair in train ----------------
# NOTE: this reuses `db` exactly as configured above (eps=6000 and the
# min_samples derived from the single case); it is not re-tuned per pair.
outliery2_rows = []
for i in train.MEASUREMENT.unique():
    DF = train[train.MEASUREMENT == i]

    for k in DF.StyleColor.unique():
        proba = DF[DF.StyleColor == k]
        X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

        y_pred = db.fit_predict(X)
        clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
        calosc = pd.concat([proba, clusters], axis=1)

        if (db.labels_ == -1).any():
            a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
            for b in a.index:
                epc_reads = proba[proba['EPC'] == b]
                # Stricter threshold here: over 90% of the EPC's reads are noise.
                if a[b] > 0.9 * epc_reads.count().iloc[0]:
                    outliery2_rows.append({'MEASUREMENT': i,
                                           'EAN': epc_reads.EAN.iloc[0],
                                           'StyleColor': k})

outliery2 = pd.DataFrame(outliery2_rows,
                         columns=['MEASUREMENT', 'EAN', 'StyleColor'])
<tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>173</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>174</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>175</th>\n", + " <td>28</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>176 rows Ă 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 3 5902690542769 QY337-00X\n", + "1 3 5902690542745 QY337-00X\n", + "2 3 5902690542769 QY337-00X\n", + "3 4 5902805533040 RH267-85J\n", + "4 4 5902805533040 RH267-85J\n", + ".. ... ... 
# --- Re-run of the single RH797-81X / measurement 24 case --------------------
# `db` and `outliery1` come from earlier cells.  The case is named explicitly
# so the recorded row cannot pick up stale loop globals `i` / `j`.
style_color = 'RH797-81X'
measurement = 24

DF = train[train.StyleColor == style_color]
proba = DF[DF.MEASUREMENT == measurement]
X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)
y_pred = db.fit_predict(X)  # fitting once is sufficient
clusters = pd.DataFrame(db.labels_, columns=['CLUSTER'], index=proba.index)
calosc = pd.concat([proba, clusters], axis=1)

if (db.labels_ == -1).any():
    # Number of noise-labelled reads per EPC.
    a = calosc[calosc.CLUSTER == -1]['EPC'].value_counts()
    new_rows = []
    for b in a.index:
        epc_reads = proba[proba['EPC'] == b]
        if a[b] > 0.5 * epc_reads.count().iloc[0]:
            new_rows.append({'MEASUREMENT': measurement,
                             'EAN': epc_reads.EAN.iloc[0],
                             'StyleColor': style_color})
    if new_rows:
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        outliery1 = pd.concat([outliery1, pd.DataFrame(new_rows)],
                              ignore_index=True)

a


def c6(x):
    """Flag EPC tags that DBSCAN mostly labels as noise, with eps from a knee.

    For every (StyleColor, MEASUREMENT) pair in ``x`` the reads are clustered
    on ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.  The DBSCAN radius is one third
    of the knee of the sorted k-th-nearest-neighbour distance curve, but never
    below 2000.  An EPC is reported when more than 40% of its reads in that
    pair receive the noise label (-1).

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.

    Returns
    -------
    pandas.DataFrame
        One row per flagged EPC with the columns ``MEASUREMENT``, ``EPC`` and
        ``StyleColor``.  Empty (but with the same columns) when nothing is
        flagged.
    """
    min_eps = 2000    # lower bound for the DBSCAN radius
    max_samples = 70  # cap applied to both min_samples and n_neighbors
    flagged = []      # collected as dicts; DataFrame.append is gone in pandas 2.x

    for j in x.StyleColor.unique():
        DF = x[x.StyleColor == j]

        for i in DF.MEASUREMENT.unique():
            proba = DF[DF.MEASUREMENT == i]
            X = np.asarray(proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]).reshape(-1, 2)

            half_rarest = int(proba['EPC'].value_counts().min() // 2)
            if half_rarest > max_samples:
                min_samples = neighbors = max_samples
            else:
                # min_samples must be an integer >= 1 (0 behaves like 1).
                min_samples = max(1, half_rarest)
                neighbors = max(10, half_rarest)
            # kneighbors() raises when asked for more neighbours than samples.
            neighbors = min(neighbors, len(proba))

            # Distance of every read to its k-th nearest neighbour, largest first.
            X_embedded = proba[['TIME_MS', 'SUM_PROXIMITY_KMS']]
            nbrs = NearestNeighbors(n_neighbors=neighbors).fit(X_embedded)
            distances, _ = nbrs.kneighbors(X_embedded)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,                     # y values
                                  S=1.0,
                                  curve="convex",
                                  direction="decreasing")

            # knee_y is None when no knee is detected; use the floor value then.
            knee_y = kneedle.knee_y
            eps = min_eps if knee_y is None else max(min_eps, knee_y / 3)

            # A single fit is enough: fit_predict returns the same labels_.
            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='auto').fit_predict(X)

            noise = labels == -1
            if not noise.any():
                continue

            noisy_reads = proba.loc[noise, 'EPC'].value_counts()
            for epc, n_noisy in noisy_reads.items():
                # count() is the per-column non-null count; the first column
                # serves as the EPC's total number of reads.
                total_reads = proba[proba['EPC'] == epc].count().iloc[0]
                if n_noisy > 0.4 * total_reads:
                    flagged.append({'MEASUREMENT': i,
                                    'EPC': epc,
                                    'StyleColor': j})

    return pd.DataFrame(flagged, columns=['MEASUREMENT', 'EPC', 'StyleColor'])


# Restrict train / test to the three StyleColors under investigation.
selected_styles = ['SF078-MLC', 'RH797-81X', 'SL171-99X']
train7 = train[train.StyleColor.isin(selected_styles)]
test7 = test[test.StyleColor.isin(selected_styles)]
}, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 27 303568480C287AC000B5BAD5 SL171-99X\n", + "1 28 30356849FC1724C000B59A42 SF078-MLC\n", + "2 21 3035684754501F0000B5B614 RH797-81X\n", + "3 24 3035684754501F0000B5B614 RH797-81X\n", + "4 27 3035684754501F0000B5B614 RH797-81X\n", + "5 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train7)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 32 303568480C287AC000B5BAD5 SL171-99X\n", + "1 35 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 41 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287A8000B5BABD SL171-99X\n", + "9 29 3035684754501F0000B5B614 RH797-81X\n", + "10 32 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " 
<td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 23 5902975236994 SF078-MLC\n", + "1 24 5902975236994 SF078-MLC\n", + "2 21 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 24 5902805820447 RH797-81X\n", + "6 24 5902805820461 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>32</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>40</td>\n", + " 
<td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 32 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 38 5902851414508 SL171-99X\n", + "4 38 5902851414508 SL171-99X\n", + "5 42 5902851414515 SL171-99X\n", + "6 42 5902851414508 SL171-99X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 29 5902975236994 SF078-MLC\n", + "12 36 5902975236994 SF078-MLC\n", + "13 38 5902975236994 SF078-MLC\n", + "14 38 5902975236956 SF078-MLC\n", + "15 40 5902975236994 SF078-MLC\n", + "16 41 5902975236994 SF078-MLC\n", + "17 42 5902975236956 SF078-MLC" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>12</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>28</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>27</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 4 5902975236994 SF078-MLC\n", + "1 4 5902975236994 SF078-MLC\n", + "2 12 5902975236956 SF078-MLC\n", + "3 23 5902975236994 SF078-MLC\n", + "4 28 5902975236994 SF078-MLC\n", + "5 21 5902805820447 RH797-81X\n", + "6 24 5902805820447 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(train7)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>35</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 35 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 42 5902851414515 SL171-99X\n", + "4 29 5902805820447 RH797-81X\n", + "5 32 5902805820447 RH797-81X\n", + "6 34 5902805820447 RH797-81X\n", + "7 38 5902975236994 SF078-MLC" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c5(test7)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>12</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>23</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>28</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>7</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>17</td>\n", + " <td>303568480C3455C000B5B30A</td>\n", + " <td>RV167-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>17</td>\n", + " <td>30356847542CCD0000B59A80</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD8000B599FA</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>26</td>\n", + " <td>30356847542CCD0000B59A26</td>\n", + " <td>QJ677-33X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>4</td>\n", + " <td>3035684754340CC000B594C3</td>\n", 
+ " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>18</td>\n", + " <td>3035684754340D0000B594EB</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>18</td>\n", + " <td>3035684754340CC000B594C6</td>\n", + " <td>RH267-55J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>7</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>17</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>19</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>19</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>7</td>\n", + " <td>303568475415740000B5A5CD</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>1</td>\n", + " 
<td>303568475434134000B5B6DF</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D4000B5B656</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>2</td>\n", + " <td>30356847542B6D0000B5B65A</td>\n", + " <td>RJ365-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>12</td>\n", + " <td>30356849FC1E348000B5B2D4</td>\n", + " <td>SP090-90X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 12 303568480C2B874000B59A39 RS483-99X\n", + "1 23 303568480C2B874000B59A39 RS483-99X\n", + "2 28 303568480C2B868000B599B2 RS483-99X\n", + "3 1 303568480C357A0000B59999 SB281-90M\n", + "4 16 303568480C357A0000B59999 SB281-90M\n", + "5 7 30356847541DA80000B5BA54 RJ369-87X\n", + "6 17 303568480C3455C000B5B30A RV167-MLC\n", + "7 17 30356847542CCD0000B59A80 QJ677-33X\n", + "8 26 30356847542CCD8000B599FA QJ677-33X\n", + "9 26 30356847542CCD0000B59A26 QJ677-33X\n", + "10 4 3035684754340CC000B594C3 RH267-55J\n", + "11 18 3035684754340D0000B594EB RH267-55J\n", + "12 18 3035684754340CC000B594C6 RH267-55J\n", + "13 27 303568480C287AC000B5BAD5 SL171-99X\n", + "14 7 303568480C5343C000B599F6 SO133-09M\n", + "15 17 303568480C5343C000B599C8 SO133-09M\n", + "16 17 303568480C53434000B599E1 SO133-09M\n", + "17 24 303568480C53434000B599E1 SO133-09M\n", + "18 24 
30356847540FE2C000B59A68 RB254-00X\n", + "19 28 30356849FC1724C000B59A42 SF078-MLC\n", + "20 11 303568458835008000B5BAD1 QY337-00X\n", + "21 11 303568458835010000B5BA58 QY337-00X\n", + "22 19 303568458835008000B5BAD1 QY337-00X\n", + "23 19 303568458835010000B5BA58 QY337-00X\n", + "24 7 303568475415740000B5A5CD RN633-00X\n", + "25 1 303568475434134000B5B6DF RH267-59J\n", + "26 2 30356847542B6D4000B5B656 RJ365-09M\n", + "27 2 30356847542B6D0000B5B65A RJ365-09M\n", + "28 12 30356849FC1E348000B5B2D4 SP090-90X\n", + "29 21 3035684754501F0000B5B614 RH797-81X\n", + "30 24 3035684754501F0000B5B614 RH797-81X\n", + "31 27 3035684754501F0000B5B614 RH797-81X\n", + "32 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " 
<td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>303568480C34548000B5B2B5</td>\n", + " <td>RV167-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>32</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>303568475415744000B599FE</td>\n", + " <td>RN633-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>39</td>\n", + " 
<td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>35</td>\n", + " <td>303568475450218000B59781</td>\n", + " <td>RH797-00X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 35 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287AC000B5BAD5 SL171-99X\n", + "4 38 303568480C287A8000B5BA63 SL171-99X\n", + "5 38 303568480C287A8000B5BADA SL171-99X\n", + "6 38 303568480C287A8000B5BABD SL171-99X\n", + "7 41 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287AC000B5BAD5 SL171-99X\n", + "9 42 303568480C287A8000B5BABD SL171-99X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + "11 38 303568480C34548000B5B2B5 RV167-87X\n", + "12 42 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835008000B5BAD1 QY337-00X\n", + "14 32 30356847541DA80000B5BA54 RJ369-87X\n", + "15 29 3035684754501F0000B5B614 RH797-81X\n", + "16 32 3035684754501F0000B5B614 RH797-81X\n", + "17 29 303568475415744000B599FE RN633-00X\n", + "18 39 303568480C5343C000B599F6 SO133-09M\n", + "19 42 303568480C5343C000B599F6 SO133-09M\n", + "20 42 303568480C53434000B599E1 SO133-09M\n", + "21 42 303568480C5343C000B599C8 SO133-09M\n", + "22 35 303568475450218000B59781 RH797-00X" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c6(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, 
def c7(x):
    """Flag EPC tags whose readings DBSCAN mostly labels as noise.

    The frame is split by ``StyleColor`` and then by ``MEASUREMENT``. Inside
    each slice the readings are clustered with DBSCAN on the two columns
    ``SUM_PROXIMITY_KMS`` and ``TIME_MS``. ``eps`` is derived from the knee of
    the sorted k-nearest-neighbour distance curve, with a floor of 4000.
    An EPC is reported for a slice when more than 40% of its readings in that
    slice receive the DBSCAN noise label (-1).

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``StyleColor``, ``MEASUREMENT``, ``EPC``,
        ``SUM_PROXIMITY_KMS`` and ``TIME_MS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``MEASUREMENT``, ``EPC``, ``StyleColor``; one row per flagged
        (measurement, EPC) pair. Empty when nothing is flagged.

    Notes
    -----
    ``NearestNeighbors``, ``DBSCAN`` and ``KneeLocator`` are expected to be
    imported elsewhere in the notebook.
    """
    result_columns = ['MEASUREMENT', 'EPC', 'StyleColor']
    eps_floor = 4000          # lower bound for the DBSCAN radius
    min_samples_cap = 70      # upper bound for DBSCAN min_samples
    min_neighbors = 10        # smallest k used for the k-distance curve
    noise_share = 0.4         # fraction of noisy readings that flags an EPC

    # Rows are gathered in a plain list and turned into a frame once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    flagged = []

    for style in x.StyleColor.unique():
        style_rows = x[x.StyleColor == style]

        for measurement in style_rows.MEASUREMENT.unique():
            proba = style_rows[style_rows.MEASUREMENT == measurement]
            readings_per_epc = proba['EPC'].value_counts()
            if readings_per_epc.empty:
                # No usable EPC in this slice, so nothing can be flagged.
                continue

            points = np.asarray(
                proba[['SUM_PROXIMITY_KMS', 'TIME_MS']]
            ).reshape(-1, 2)

            # Half of the rarest EPC's reading count, capped at 70.
            min_samples = min(int(readings_per_epc.min() // 2),
                              min_samples_cap)
            # k for the k-distance curve; it may not exceed the number of
            # points, otherwise kneighbors() raises ValueError.
            neighbors = min(max(min_neighbors, min_samples), len(proba))
            # DBSCAN needs an integer >= 1. A value of 1 clusters exactly
            # like 0 would (every point counts itself as a neighbour).
            min_samples = max(1, min_samples)

            nbrs = NearestNeighbors(n_neighbors=neighbors).fit(points)
            distances, _ = nbrs.kneighbors(points)
            distance_desc = sorted(distances[:, neighbors - 1], reverse=True)

            kneedle = KneeLocator(range(1, len(distance_desc) + 1),  # x values
                                  distance_desc,                     # y values
                                  S=1.0,
                                  curve="convex",
                                  direction="decreasing")

            # knee_y is None when no knee is detected; use the floor then.
            knee_y = kneedle.knee_y
            eps = eps_floor if knee_y is None else max(eps_floor, knee_y / 5)

            # Fit once; the original fitted the same model twice.
            labels = DBSCAN(eps=eps, min_samples=min_samples,
                            algorithm='auto').fit(points).labels_

            noise_mask = labels == -1
            if not noise_mask.any():
                continue

            noise_per_epc = proba.loc[noise_mask, 'EPC'].value_counts()
            for epc, noise_count in noise_per_epc.items():
                # Compare against the number of rows this EPC has in the slice.
                if noise_count > noise_share * readings_per_epc[epc]:
                    flagged.append({'MEASUREMENT': measurement,
                                    'EPC': epc,
                                    'StyleColor': style})

    return pd.DataFrame(flagged, columns=result_columns)
+ { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>26</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4</td>\n", + " <td>30356847541DA7C000B5BADD</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>26</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>11</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>11</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>11</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>18</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>18</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>18</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>17</td>\n", + " 
<td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>24</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>24</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>23</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>1</td>\n", + " <td>303568475434134000B5B6DF</td>\n", + " <td>RH267-59J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>26</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>28</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", 
+ "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 26 303568480C2B868000B599B2 RS483-99X\n", + "1 16 303568480C357A0000B59999 SB281-90M\n", + "2 4 30356847541DA7C000B5BADD RJ369-87X\n", + "3 26 30356847541DA80000B5BA54 RJ369-87X\n", + "4 11 303568480C53434000B599E1 SO133-09M\n", + "5 11 303568480C5343C000B599F6 SO133-09M\n", + "6 11 303568480C5343C000B599C8 SO133-09M\n", + "7 18 303568480C5343C000B599C8 SO133-09M\n", + "8 18 303568480C5343C000B599F6 SO133-09M\n", + "9 18 303568480C53434000B599E1 SO133-09M\n", + "10 17 303568480C5343C000B599C8 SO133-09M\n", + "11 17 303568480C53434000B599E1 SO133-09M\n", + "12 24 303568480C5343C000B599F6 SO133-09M\n", + "13 24 303568480C53434000B599E1 SO133-09M\n", + "14 24 303568480C5343C000B599C8 SO133-09M\n", + "15 24 30356847540FE2C000B59A68 RB254-00X\n", + "16 23 30356849FC1724C000B59A42 SF078-MLC\n", + "17 11 303568458835008000B5BAD1 QY337-00X\n", + "18 11 303568458835010000B5BA58 QY337-00X\n", + "19 1 303568475434134000B5B6DF RH267-59J\n", + "20 21 3035684754501F0000B5B614 RH797-81X\n", + "21 24 3035684754501F0000B5B614 RH797-81X\n", + "22 26 3035684754501F0000B5B614 RH797-81X\n", + "23 27 3035684754501F0000B5B614 RH797-81X\n", + "24 28 3035684754501F0000B5B614 RH797-81X" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c7(train)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EPC</th>\n", + " 
<th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>51356847542A2B0000B5B280</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>30356847542A2B0000B5B215</td>\n", + " <td>RM119-93X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>34</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>35</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + 
" <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA61</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>42</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>34</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>42</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>38</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>39</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>42</td>\n", + " <td>30356849FC1723C000B5B1A3</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>38</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>38</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>38</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>39</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>38</td>\n", + " <td>303568480C357A0000B59A61</td>\n", + " <td>SB281-90M</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 42 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287A8000B5BABD SL171-99X\n", + "8 42 51356847542A2B0000B5B280 RM119-93X\n", + "9 42 30356847542A2B0000B5B215 RM119-93X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + "11 34 303568458835008000B5BAD1 QY337-00X\n", + "12 35 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835010000B5BA58 QY337-00X\n", + "14 42 303568458835008000B5BAD1 QY337-00X\n", + "15 42 303568458835010000B5BA61 QY337-00X\n", + "16 42 30356847541DA80000B5BA54 RJ369-87X\n", + "17 29 3035684754501F0000B5B614 RH797-81X\n", + "18 32 3035684754501F0000B5B614 RH797-81X\n", + "19 34 3035684754501F0000B5B614 RH797-81X\n", + "20 42 3035684754501F0000B5B614 RH797-81X\n", + "21 38 30356849FC1724C000B59A42 SF078-MLC\n", + "22 39 30356849FC1724C000B59A42 SF078-MLC\n", + "23 42 30356849FC1723C000B5B1A3 SF078-MLC\n", + "24 38 303568480C53434000B599E1 SO133-09M\n", + "25 38 303568480C5343C000B599F6 SO133-09M\n", + "26 38 303568480C5343C000B599C8 SO133-09M\n", + "27 39 303568480C5343C000B599F6 SO133-09M\n", + "28 42 303568480C5343C000B599F6 SO133-09M\n", + "29 42 303568480C53434000B599E1 SO133-09M\n", + "30 42 303568480C5343C000B599C8 SO133-09M\n", + 
"31 38 303568480C357A0000B59A61 SB281-90M" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c7(test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DBSCAN_1-zmiany_df.ipynb b/DBSCAN_1-zmiany_df.ipynb index e73fa9a39778f2f7174671da8c26cde128aeb617..694c4ac49efedcbc045715864b0e6e838c605e6d 100644 --- a/DBSCAN_1-zmiany_df.ipynb +++ b/DBSCAN_1-zmiany_df.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -500,23 +500,6 @@ "execution_count": 5, "metadata": {}, "outputs": [ - { - "data": { - "text/plain": [ - "{'whiskers': [<matplotlib.lines.Line2D at 0x215c384d970>,\n", - " <matplotlib.lines.Line2D at 0x215c384dcd0>],\n", - " 'caps': [<matplotlib.lines.Line2D at 0x215c4d13070>,\n", - " <matplotlib.lines.Line2D at 0x215c4d133d0>],\n", - " 'boxes': [<matplotlib.lines.Line2D at 0x215c384d610>],\n", - " 'medians': [<matplotlib.lines.Line2D at 0x215c4d13730>],\n", - " 'fliers': [<matplotlib.lines.Line2D at 0x215c4d13a90>],\n", - " 'means': []}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - }, { "data": { "image/png": "\n", @@ -531,12 +514,12 @@ } ], "source": [ - "plt.boxplot(df['LENGTH_OF_MEASUREMENT'])" + "plt.boxplot(df['LENGTH_OF_MEASUREMENT']);" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -546,7 +529,7 
@@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -855,7 +838,7 @@ "[8 rows x 24 columns]" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -866,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -877,7 +860,7 @@ " 42], dtype=int64)" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -944,7 +927,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -954,7 +937,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -969,9 +952,72 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>EPC</th>\n", + " <th>PROXIMITY</th>\n", + " <th>TIMESTAMP</th>\n", + " <th>MEASUREMENT</th>\n", + " <th>ITEMID</th>\n", + " <th>EAN</th>\n", + " <th>StyleColorSize</th>\n", + " <th>StyleColor</th>\n", + " <th>Size</th>\n", + " <th>SubclassID</th>\n", + " <th>...</th>\n", + " <th>SEC</th>\n", + " <th>MICROSEC</th>\n", + " <th>MILISEC</th>\n", + " <th>TIME_MS</th>\n", + " <th>TIME_PER_MEASUREMENT_MS</th>\n", + " <th>NUMBER_OF_SIGNALS</th>\n", + " <th>LENGTH_OF_MEASUREMENT</th>\n", + " <th>TIME_KMS</th>\n", + " <th>MAX_PROXIMITY_KMS</th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " </tr>\n", 
+ " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "<p>0 rows Ă 36 columns</p>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [EPC, PROXIMITY, TIMESTAMP, MEASUREMENT, ITEMID, EAN, StyleColorSize, StyleColor, Size, SubclassID, ItemSeason, FashionLevel, SubclassID.1, SubclassName, ClassID, ClassID.1, ClassName, DepartmentID, DepartmentID.1, DepartmentName, BrandID, BrandID.1, BrandName, Active, HOUR, MIN, SEC, MICROSEC, MILISEC, TIME_MS, TIME_PER_MEASUREMENT_MS, NUMBER_OF_SIGNALS, LENGTH_OF_MEASUREMENT, TIME_KMS, MAX_PROXIMITY_KMS, SUM_PROXIMITY_KMS]\n", + "Index: []\n", + "\n", + "[0 rows x 36 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "#do testow samego modelu\n", "test_1 = df[(df.TIMESTAMP <= '2021-10-26T10:30:00.000') & (df.StyleColor == 'RH797-81X') & ((df.MEASUREMENT == 2) | (df.MEASUREMENT == 3) | (df.MEASUREMENT == 4) | (df.MEASUREMENT == 26) | (df.MEASUREMENT == 28) ) ]\n", @@ -996,9 +1042,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "plt.figure(figsize=(10,8))\n", "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test6)\n", @@ -1007,9 +1066,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "plt.figure(figsize=(10,8))\n", "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 3])\n", @@ -1018,9 +1090,22 
@@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "plt.figure(figsize=(10,8))\n", "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 4])\n", @@ -1029,9 +1114,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "plt.figure(figsize=(10,8))\n", "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EPC', data=test_1[test_1.MEASUREMENT == 26])\n", @@ -1040,9 +1138,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAnQAAAHhCAYAAADnKiOiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAABxoElEQVR4nO3deXzcVb3/8deZPfueNm3aJm3ThS4UGpYCIgULXBCQohVEBC7CT6/oBRcuKoreK/Zy3UBFvXgBcblwWbSgslRRBFwoLZSl+5a2adM2+z7r9/z+mHRomtCmZJnM5P18PObRmXO+3+98ZqbJfHJWY61FRERERFKXK9kBiIiIiMjgKKETERERSXFK6ERERERSnBI6ERERkRSnhE5EREQkxSmhExEREUlxnmQHkEzFxcW2oqIi2WGIiIiIHNWaNWsarLUl/dWN6YSuoqKC1atXJzsMERERkaMyxux8pzp1uYqIiIikOCV0IiIiIilOCZ2IiIhIilNCJyIiIpLilNCJiIiIpDgldCIiIiIpTgmdiIiISIpTQiciIiKS4pTQiYiIiKQ4JXQiIiIiKU4JnYiIiEiKU0InIiIikuKU0ImIiIikOCV0IiIiIilOCZ2IiAw76zhYa5MdxpBqbGimfn9jssMQAcCT7ABERCR9OZEw4bZWQs0NuP0BAkWleDKzkh3WoLS3dfDHZ17gv+96ELfHxeVXL+X9H1hCQVF+skOTMUwJnYiIDAtrLcGmBoL79wIQ6+ok0tpC7vRZuAMZSY7u3Xv1lTcIdXRw9w9uxe/10NjSybo3NnLG4lOTHdqAOI5DqK0Lj9+LN8Of7HBSnnUcnGgU43bjcruTFocSOhERGRZOJEzwwL5eZdaJEe3uSumEzmUdlpwyAxOLAjFy8z10enOIRqJ4vKP7a7WjvoXNf1rLzpc3kFOaz/yl76GkaiLGmGSHlpJiwW66D+wj0taCOyODzLJyPJnZSYlFY+hERGSYmH4ThdGSPFhrqdm+i1V/e5XtW2qIxWIDOm965YSeZO5t2TaIsc5whDlkopEo6373D7b+eS2RrhBNNft54fu/oXWPxgG+G04sSueenYRbGuN/qHR20L59C9FQMCnxjO4/JUREJGW5fT4C4yfQvXd3osx4PLgzMoftObdu2sEfn3mBzRu2ce6FZ3HKaSf2GdvW2NDElk07OLCvgTu+/F26u4N4fV6+fuctnH/x2Xg8R/5qzM7OINLZu8zGosDonvQRbO5g16pNvcqcaIy2ukbyy4uTFFVqsrEY0a4uop0dvcudGE4oCP7AiMekhE5ERIaNP78It9dHuLUFdyCANzcf9zB92e3euYf/99HPUX8g3uL0x6f/wme+cD3XferKRKtgW0sb3/qPexg3roRHfvUE3d3x1pRIOMJXb7mTWXOrmD6j8ojPk5GbS+TAXjhk1q6/qBSX1zcsr2uouLxuvJl+wp29W5A8gdEd92hjrSXU0ogTDoMxvf4fABhXcjo/1eUqIiLDxuXx4MsrIHtyJRmlZXiGcezc5g3bEsncQT/94S/YW7s/8XjrlhqeWvFHPD4PnR1dvY6NRqIc2N9w1OdxBzLJqazCk5WN8XoJlJYRKC4dNV3J7ySzIIcFH3pvr7LCynEUlJckKaLU5ERCdNXVEm5tJlBU2qvOm5OXtPGhaqETEZG04Dh9uzwdx8Hy9ti2jvZ4X2kkHCE7JyvxGMDr8zJu3NGTG2MM3uxc3BlZ4DgYj2fUJ3MHlZ8wnazCHFr2NBDIzaSwcjwZBckZxJ+qbMwBx8EJh4h2d5ExbiLWieLyZ+DLycXl8SYlLrXQiYhIWpgxeyr5BXm9yj52w4eZMHF84nHF1Elk52Tx5OPPcsOnryIrOz6eLxDw841v38qUqeUDfj6X243L602ZZA7A4/dSMqOcqsULmLRwBlmFuckOKeW4fH48WTkARDvb6d6/h2BjA57MrKR2u5t0W7n7WFRXV9vVq1cnOwwRERkim9ZvZcUjT7F
h3RYu+eD5vGfxIopLC3sd8/qr6/j2N37EgX31fPxTH2X8hBImTZ7I5MpyXEka/ySpJb5cyd6esaEZZJZNwpudM+zPa4xZY62t7rdOCZ0SOhGRdOM4zhGTs472TtrbOigoyicQ0OK6cuziCwpHMC43rqPMjB4qR0roNIZORETSztFa2rJzssjOSe0tyCS5jMuF2zd6/hhQ27KIiIhIilNCJyIiIpLilNCJiIiIpDgldCIiIiIpTgmdiIiISIpTQiciIiKS4pTQiYiIiKQ4JXQiIiIiKU4JnYiIiEiKU0InIiIikuKU0ImIiIikOCV0IiIiIinOk+wARERERIZbLBwi0tZKuKMVf0Exsa5OYsEgvoJCPFk5uL3eZIc4KEroRETkXXGiEWKhEE40gsvtxh3IxOXR14qMPk4sSteeXUTaWwmUjKdrz05sNApApL2FjPETCZSMxxiT5EjfPf3kyahlHYdYsBsnEsbl9eL2Z2Dc7mSHJSJALBQk0t5GsPEATigIgCc7h6zyCtw+f5KjE+nNCYWItLcCYFyuRDJ3UPeBOnz5hSn9f1cJ3TCJhYJEOjtwQkE8mdl4srL1l+sxsNYSammkq3ZnoixjfDmB4lKMS0M/j6attZ1gd5D8wjxamloJZATIzctJ1EcjEVp2HqC1rgl/dgYFU8aRVZhzhCuK9BbuaCfW3ZlI5gCiHe1EOtpwF5YkMTKRI7PW9lM48nEMNWUYwyAWCdOxcxuxYHeiLB2ac4/GWodoVxfR7k6My02suxO3P4AnOxdPIOOYrhULBenas6tXWfe+Wrw5uXgyMocy7LQSi8V45e+v8Z07fszp7z2ZjvZOnn7yOcomjuNzX/okJ512Ah6Ph71rt/OP+55OnFc0tYxF119AZoGSOhkYJxwi1t3Vpzza2QEpkNBZa4l2dhDpaMXGYnhz8vBk5eBSL0Bacvn8eLJziHa0A2DcHmzs7Va6QOl4XF5fssIbEmrqGAax7u5eyRxA9/46nHAoSRGNjGhnBx07t2EjEbpqawg11tO1dzcdO7YQO+Sv+IGwsSj081eUc1gzufS2af02PvmxW3BiDgf2NfDIL5+gva2DzRu28S/X/Bub1m2lu6WD1x75S6/zGrfX0VLbkKSoJRW5AwE8mdl9yr3ZqfFHQbSznY6dWwke2EeosZ6Omq2JLjlJPy6Ph6yJU8goKyfa1UFm+RT8RaV4suLDBAKFJSnf4KIWumFgrdNPodN/M2+asNahu34/vvwCQk31veqcSJhYsBu3PzDg67m8vj5/QeFy4fal9l9Qw23b5h3EYjFOP+tkHvrZr3vVxWIxtm2pYcqE8YQ7uvucGw2m9x8cMrS8WTkQc3BnZBHr7gTAl1eAJ2v0J3TWWiKd7dhYrFd58EAd3pxcXG59NaYjtz9ARsl4AsXjMMbgzyvAWpvyidxBaqEbBu5A38H78cGWaZyMWCAWwxgX1umb0PZXdiRun5/sKdMSTeDG4yVnyjRcKTxgdSRkZce7o1ua2ygqKexTn5mVSSA/m0knzexV7nK7yBlfNCIxSnpw+/z4CorImjiZ7IoqcqbPIjNlJkRYrNP3D2zrOGkxlkqO7NAELl2SOVBCNyw8/gA5lTPw5hXg8vkJlJaRMW4CxpW+YzOMy4W/uJRwWwu+/MMSCZfrXY1782bnkDt9FrlVx5E7fTbenLy0+uEbDrPnzWDWnCr+8NTzLPvoJb3er6pZU5k9bwYer4c57z+Vqe+Zh8fvJW9iMe/59AfIn1icxMglFbncbjyZWfhy8/BmZqfM+DNjXHiz+nYX+4tLNXlNUpZJ527Ao6murrarV68etutbx8E6zpj5BeHEYkTaW+PjB60l0taCy59BRum4fsfayPDYU7uP9W9sJBKOkpefw57afeQX5DFn/kwmTipLHOfEYgTbuvD4vfgyB94dLpIOYtEo0c52Qg0HsLEo/sISvHl5uL2p0MIoY5UxZo21trq/urGRaSSJcbnG1BIbLrcbf35
hfFyKgUBpGcaYMfUejAYTy8czsXz8UY9zud2a1SpjltvjwZ1XgDcrB4vF7UntXQJElNDJkDs4flCdoyIy2o2VHhRJf2o6EREREUlxSuhEREREUpwSOhEREZEUp4ROREREJMUpoRMRERFJcaMyoTPGfMsYs9EY84Yx5jfGmPxD6r5ojNlqjNlkjDnvkPKFxpg3e+q+b7QCrYiIiIwRozKhA/4AzLXWzgc2A18EMMYcB1wOzAHOB35kjDm4NPmPgRuAqp7b+SMdtIiIiEgyjMqEzlq70lp7cFf2fwDlPfcvAR621oastTuArcDJxpgyINda+3cb3/ri58AHRjpuERERkWQYlQndYf4ZeLrn/kRg9yF1tT1lE3vuH14uIiIikvaStkS2MeaPQH/7E33ZWvtEzzFfBqLArw6e1s/x9gjl/T3vDcS7Zpk8efIxRi0iIiIy+iQtobPWvu9I9caYq4H3A+f0dKNCvOVt0iGHlQN7e8rL+ynv73nvBe4FqK6u7jfpExEREUklo7LL1RhzPvBvwMXW2q5Dqp4ELjfG+I0xlcQnP6yy1tYB7caYU3tmt34MeGLEAxcRERFJgtG6K/EPAT/wh57VR/5hrf2EtXadMeYRYD3xrthPWWtjPed8EvgZkEF8zN3Tfa4qIiIikoZGZUJnrZ1+hLo7gDv6KV8NzB3OuERERERGo1HZ5SoiIiIiAzcqW+hEZHD219Wz7o2NNDW1UjahlMb6JjIyM5g1ZzqTpmhFH5HBcByHfXv309rSTigUZmL5eErGFSc7LBnjlNCJpJn29nZ+/j+P8Iv/eYRbvnojN99wG8FgCICyieP4yS++ReW0KUe8xv599WzdvINoJMqWjdvp6Ojk5EUncuLJ8wkE/CPxMkRGpZbmNv7+4itYa7nnO/eze+ceJk4q4/b//DynnlGd7PBkDFNCJ5JmNq/fzq/uf4wF1XN56fmXE8kcQFZWJpvWbz1iQtfW1sHP/vthZs+p4u4776X+QCMA9//of/nWPbdz3vvPHvbXIDJarV3zFsYY/vP2u4lGY1x2xUXk5mbzw2/fR/nkCZRPnpDsEGWMUkInkmba2zpwHIfikkK2b90JwIxZU/nK1z7FlIklZOTk0LVvD7GuTjzZufjy8nH7A4nzd2zdSWZmgNrddYlk7qAffOs+TjltIfmFeSP6mkRGi+1bduAPBBg/oZQl/3QW//eLFTQ3t3LehYvZX1evhE6SRpMiRNLMlMpyxpWV8OqqNzjjrFMwxnD7129kSr6PQGYGoeYGggfqiHS00b2vlo7dO3CikcT54VAYt9tDLBrtc+32tnYikb7lImNFcXEheXk5nHvhYr7/rZ/iOA43fupqTpozm1BrJ2+vgy8yspTQiaSZyulTWH7XbZRPmUhbawc3/dsNlBVk4MsrwMaiRDvaeh0f6+okFgomHk+ZOom6vfsYV1aKx+PudexHrr2MknFFI/I6REajBSfNJ+bEcGIxxo0v4d+/+Glyt7cSXr2T/U+tpXbt1qNeI9QZZM/r21j9q+fY/NyrtO1rGoHIJd2ZsfzXRHV1tV29enWywxAZUh31LbTvb8bxuQi3duP1e8jNdvAEMsDtJri/7654OVNn4s3OSTzesmk7WzfVEI1EeOyh39JY38RlV7yf9196rmbzyZi3v+4Af39+FZ0tHQTWNxA7pNXaE/Cx5EsfIac0n+7WTjrqW/D4PORNLMHljrehbH7uNdY++pfEOVlFubz35svILtZQBjkyY8waa22/s280hk4kjbQfaOGtJ/5K9rgCOhva2LVqI8YYzvrM+3FCzfhLxuPJyiba2ZE4x52R2WsMHUDVzKlUTJ1MZ0cnp5xRjdfjoqCoYKRfjsioNK6slGnlE9ln99Ec2derLhoM093aQaQryOu/fon6zbW4vW5mnX8SU98zDxuN8dZv/97rnM7GNlpq65XQDYFYOIgTjmDcbtyBAMaMnY5IJXQiaaRxex25ZfEu0V2rNgJgrWXtipd
ZcOkp+KJR/EWluDOziXV14MnKxVdQiMvr7XMtr9dDfoG+YET6U5CTQ6DCS+vftuJEY4lyt9dDRl4W637/MvWbawGIRWKs++0/yJtQTP6kkl7HH2RjzojFnq4iHW107NyGjcUAQ8aESQQKizAu91HPTQdjJ3UVGQNCHd1Yx8E6vYdSNO88wPN3/45QxIM/v5DM8RPJmTqTzPET8BzWOiciR1dUWUZ7bSOzzz8p0ZXq8rg56eolOFGH/et29jmndW8DmYU5VC0+vle5N+Ajb4KGMgyGEwnTuXtHTzIHYOneu4tYMHjE89KJWuhE0khhxThqX92K2+cmIz+b7pa3u1YzCnMI5GQCYIxJVogiaSEjL4t5F51K24FmiqdPJBaJkFWcR864AsId3eSUFRLasqfXOVmFubhcLqrOPoHMwhx2/G09eROKqDr7BHLLCpP0StKDE43iRCJ9ymORMB6ykhDRyFNCJ5JGCqeMI9wZJNjWxZyLFlHz9/U079xPyYyJzP/AGWTkZyc7RJG04c3wUzRlfJ/yQG4Wx11wCn//798RCYYBKJ4+kcLK+LGZBTlULT6BytPn4vK4cbnUWTZYLo8Hl9fbJ6lze31JimjkaZarZrlKGgp1dBMNRRJrYgVyMvD4x84vNpHRoLFmH+37mvD4feRPKtGkh2HWdwxdOYHC4rQaQ6dZriJjjD87A392RrLDEBnTiirGU1TRtwVPhoc3O5fcqtk44TDG7cHtD2DGUOunEjoRERFJC25fALdvbE70Gjupq4iIiEiaUkInIiIikuKU0ImIiIikOCV0IiIiIilOCZ2IiIhIilNCJyIiIpLilNCJiIiIpDitQyciIoNmHYdoVyfhrna8gSyccIhIRxsunx9/fiGezLGxn6ZIsqiFTkREBi3a1UH7js14/BlEuzro2ruLSFsLoYb9tG/fTCzYnewQRdKaEjoRERkUay3BhgP48gqw0Sjh5obe9U6MSFdnkqITGRuU0ImIyOBZCxjAxu/2Wy8iw0UJnYiIDIoxBn9xKeHWZozHi7+g6LADXBpDJzLMNClCREQGzZuVTU7FNMKd7XiyssnwlBNpa8Hl9eEvKsGTkZnsEIeFtZZYKAQGXC43xuPGGLWVyMhTQicyBjnRKLFgF04kjPF4cWdk4fbo14G8e8blxpuThyc7FwCbnYu/qBSXK32Tm2iwm2hXB1hLuLWZWLAbb04+GSXjcAcykh2ejDH6DS6S4mLRKJ2N7USDYRzHIbs4j0DOO7eG2FiMSFsLnXt2JsY1+YtKyRg/EZfbPVJhS5oyxvT6N11Za4l1d+OEQoSaGrCxKADh5gZioSA5ldNxufUVKyNH/9tEUlhnQyttB1oItnay+Y+v0rqngaziPE66egnF0yYAfb9go6Fuuvbt6TVIPdR4AG9uHr6cvJF/ESIpyIlGiHS04vL6EsncQbGuDpxQCFemvmJl5Oh/m0gKa6zZR6i9i01/eJWupnYAZi5ZQF5pFt31dbg9XiId7dhoBH9hCd6cXGwsho1G+lyrvzIR6Z/B4PJ4+m+JNAbSuKtZRicldCIpKtQZxOVyEe4MJpK58XOmUH78lHiLW1YOXXtrwToARDs7yJw4GU9GNm5/gFgo2Ot6Ll+gz3N0NrXR2dCKy+0mf1IJHp93+F+YSApweb14snMJNTfiycoh2tmeqAuUluH2+5MYnYxFSuhEUpTb5yEWc3D7fLg8bpxojKmnz4ZYlGhHO57M7EQyd1D3/jpyp88mo2wSXXt34YRD4HKRWTapzyzEhm172fDsK9S9sQOASSfNYP4lp5NVrG5ZEYjP7DUYnFgEX34BTjSKJyMTT2aWZrrKiFNCJ5KiPF4P2SX5eP1eZpxzIhuffQXrHFzctX/GGIzL4MvNw+WbjhOJ4PJ4+iRz3a2d7H1jeyKZA9j9ymaKp06gavGCYXpFIqklPrM3N9lhiABK6ERSWlHFONoONOMJ+CisKAUDxuPBm5vfk7y5sU4scXxg3AR
cnni3qSeQAe+wtEI0GKJxx74+5Xvf3KGETkRkFFJCJ5LicksLyC0tSDyOhkP4XS5iwW4yysqJhbpxIhH8BUV4snIGdE2310tuWSH1m2t7lRdPKxvS2EVEZGgooRNJMx6fH3x+GMQSJJmFOZQvmM6Bjbtp398MQG5ZEeUnVA1VmCIiMoSU0IlIv0pmlnPq9RfQsb8Zl8dNfnkJWUUaLyQiMhopoRORfrlcLgrKSygoL0l2KCIichSaVy0iIiKS4pTQiYiIiKQ4dbmKjAEb1m2hfl89ra3tZGdnEQqFqduzn6LiAhZUz2NyxcRkhygiIoOghE4kzW14awtr17zJo798kgs+sIRGTxPfW/7fWGsBmFQxkZ/84ttMmjwhyZGKiMi7pYROJM1t27yd/XX1nHTaCdTt2ccrf38tkcwB7K7Zw/o3NimhExFJYRpDJ5LmuoMhurq6ycrKxOvz0tTY0ueYjvbOkQ9MRESGjBI6kTRXUTGJieXj+cdLq3EchyUXnNWr3uVyMWP21OQEJyIiQ0JdriJpbvb8GcSsg9fno7Ojk9zcHJZ99BL++PRfKBlXzE23/j9mz52R7DBFxgzrODjRKGAxbg8utxuAxvomNry1mddWv8lbr2/k1DOqed8F79VwCBkQc+hYmrGmurrarl69OtlhiIyI+v0NdHR0kpWdicvlIhQKk5uTTU7ewPZ3FZHBC3d1YiMRbDRCuLkB6zj4i0px5+Ty+xXPseKRp3n1lTcSx5902gl898f/Tl6+dmkRMMassdZW91enFjqRMaJkXDEl44qTHYbImBULh7DhEE44TPe+2kR5156dtAUK2VdX3yuZA3jlb69Rs303x584Z6TDlRSjhE5kDNq2eQevrX6Tv72wmmlVU1h87hkcN29mssMSSWtOOEwsEsaGQ33qYsEu4B16zMZwT5oMnBI6kTGmob6R3zzyFD//6SMA/PFpWPHI09z38F1MrixPcnQi6c0A1mV6Hpj4zXEozs1gXFkpxy+cw+tr1iWOP/Gk+UyZOik5wUpKUUInMsbs2lHLo798slfZ/n31rHtz04gkdDu27aJuz36aG1t4dfUbTJgwjtPee7ImZkjac/sDOLEYjoX6iIe31u+go7OLuXOmcdzkQs446xSKigpYs+oN3ly7ntPOPIlzL1xMfkFeskOXFKCETmSMcRxLNBbrWx5zhv25d+/ay28e+T0FBfl8b/lPEuW/fOBx/ueh7zGtqmLYYxBJFpfXy+bte2lsbObfv/htGg40xctdLn5w33Lec/aplJyziDPPWZTkSCUVaR06kTFm0pSJXHLZ+b3K8vJzR6SFbNO6rcw6roqHHvx1r/LG+ibeWrth2J9fJJmi0Sjr39rMm2s3JJI5AMdx+PHdD9DZ0ZXE6CTVqYVOZAxoamxmd80e2to6cLkMFy5dwoTy8Tz/3N+YVjWFpR++kKlVU4Y9jkgkgssYuruCferC4fCwP79IMkWjMay1dHd296lrbmwhHA6TRWYSIpN0oIROJM21trTx7O/+HF97LhjG43Hz7Tt+xNTpUzjtzJNoaGimuLRoRGKpmjmVP//hJS667Fx+ed9jiXKf38fsuZplK+ktEPAzrqwUr9eLMabXnsrLPnoJBYX5A7pOJBzh9dfW89Kf/8GGdZs5/cxTeM/Zp1A5bfj/KJPRSwmdSJrbumkHefm5/O43Kzmhej6/X/EXopEomzdsY/OGbQCcufgUJk4qG/ZYps+spLs7SFNDM7m5Ofzhqb8wfkIJH7v+w8yZr4ROUovjOOyq2UNHWycut4twOExBQR6TK8txHIetm2poa2ujuzPIgf0NZOdkMeO46WQE/Nz+n5/noQd/Q2tLG8s+egnvX3regJ93/Vub+fcvfpuabbsA+PsLq1n35ka+/l//RiDgH66XK6PcqE7ojDGfB74FlFhrG3rKvghcB8SAz1hrn+0pXwj8DMgAngL+1Y7lbTBEerS1teN2u7E2vuXQvr37+xzTWN88YvH
MWzCbSDjCiSfP50MfvYTsnEz8fn0JSWppa+vg1VWvs21LDQUF+ax49CnWrn6LjIwAN3/pk0ydMYXG/Y20tLRx95330tXTzTppykR+/PNvMWtOFdWLTiDg91M6Pr7gt3Ucol2dxEJBsBZ3RiY2GsE6Du5ABu5ABsYYdm7fnUjmDnrmyT/x0X/+EPMWzB7x90JGh1E7KcIYMwlYAuw6pOw44HJgDnA+8CNjjLun+sfADUBVz633qG+RMWpKRTkdHZ0UFubj8/s485zT+hwza07ViMbk9XnJyc2mqLhAyZykpK2btvPMb/8EwPN/eIm1q98CoLs7yOuvvkXNtt3U7qrjpedfTiRzALt37uEfL60mvzCPyVMmJpI5gGhnBx27ttO1ZydOOERnbQ0dO7fRuXsHbVs3EO1sB8D2swBxvP1CbRhj2ahN6IDvAbfQ+3/oJcDD1tqQtXYHsBU42RhTBuRaa//e0yr3c+ADIx2wyGhUOX0K06qmsOTC9+I4DgtPns8/XXwOHo+bopJClt99G3OOV3enyLFobmol1B3CiTq8/NdXe9WVjivGGIhEI+zdva/PuTt31PYpi4XDRDrasNFIvMDlwgkdMnnIWrr27cGJxaionMTkiom9zj/3wsVUzZw6+BcmKWtUdrkaYy4G9lhrXzfGHFo1EfjHIY9re8oiPfcPLxcZ84wxLFg4j5bmViqnTSIcjrLgpHlcf+NHyS/IG7EJESLpJDMzI77tg4n/0bT+zU2Julf+/hqTpkwkOzuLRe+pZtuWml7nnnrGwj7Xs1hiB7cEMy5w+q4L6YRD4DjMnjeDr915C3/+w1/Z8NZmzjjrFN77vtMIZASG8iVKiklaQmeM+SMwvp+qLwNfAs7t77R+yuwRyvt73huId80yefLkAcUqkg7yC/K04rzIEKmYOokzzz6N9rYOPrDsn9ixbRfdXfGu1cLiAuafOId9e/aTk5fDxR88j6eeeI5AwM+nPvvPHL9wbp/rud0evNm5RFqbwToYT9+vZ39BMcbjwWcM1acuoPrUBcP9MiWFmNE2b8AYMw94Dji4wmI5sBc4GbgWwFq7vOfYZ4GvATXAn621s3rKrwDOstb+vyM9V3V1tV29evXQvwgREUl7zY0tbN9aQyQaIxqJ0tzUQlFRAbPnzaCgMJ9QKMSBfY0Eu7uJRmPk5ecyoby/doy4SFcXkfYWQvX7cfkD+AsK6T5Qh43G8BUWk1EyHrfGnI5pxpg11trq/upGXZertfZNoPTgY2NMDVBtrW0wxjwJ/K8x5rvABOKTH1ZZa2PGmHZjzKnAy8DHgB+MfPQiIjJWFBTls7BowTvW+/1+Jk2ZMODreTMz8QT8+HLysIA7kIEvrwBrLS6vF2NG87B3SbZRl9AdibV2nTHmEWA9EAU+Za09uCnlJ3l72ZKne24iIiIpw7jceDKz3i5w+ZIXjKSUUZ/QWWsrDnt8B3BHP8etBvoOTBARERFJc6M+oROR9BMNBbHRKBhw+wK4+hkALiIiA6ffoiIyYqzjEO7swIaCdNfXYSMR3BmZZE2c0rubSUREjolGWIrIiIkGu7GhbrrqdmMj8QVUY91ddO7egRONJjk6EZHUpYROREaMjUbiidthyyXFQkGcSDhJUYmIpD4ldCIyYozLhXH1/bVjXG6My93PGSIiMhBK6ERkxLj9AVw+H768gl7lmRMnacFUEZFB0KQIERkxLq8PT1YOLrcXb24+1nFw+zPwZGYmOzQRkZSmhE5ERpTb68Pt1WKpIiJDSQmdiAyrcDjM+jc309rShsu42LxxK7tq9lB96gJOO/MkiooLkx2iiEjKU0InIsPq1VVvsGndVsZNKOWBnzzEhrc2A/Cb/3uKK6+9jM9+6ZN4fd4kRykikto0KUJEhk0oFOZXP3uc7Nxs9tbuSyRzBz388xXs2rknSdGJiKQPJXQiMmycmEOoO4y1llg01qc+FosRjfQtFxGRY6OETkSGTUZmgEuXXUAkEmXCpPG
UjivuVX/m2Yson1KWpOhERNKHsYet2D6WVFdX29WrVyc7DJFRr6ulEycSwbgNYHCiMdx+L5l52Uc9t62tgzX/WIu1Fmstf3j6L2xev42zlpzOB5ZdwOSKicP/AkRE0oAxZo21trq/Ok2KEJF31N3SQfOuA4S6gljHwZvhp6uhjW0vvokTc5j5vhOZfNJM/NkZ73iN3NxsFp97BsHuII61VJ96PAYXufk5I/hKRETSmxI6EXlHjTv20VJbT6Q7hMvjJpCbxeuPv5iof+3/nseb4afi1NlHvVYgIxC/k/nOyZ+IiLw7SuhEpF+hjm7a9jWBBW/Ajz87g/qttX2O2/7Sm0w+aQYut/ZiFTnIcRxi3R04kRjG7cIJhwFwZ2TizcxKcnSSjpTQiUi/XB43Hr+XSMwBwHFi+LL6tq5lFORgjOZXiRwq2tlOtL0Nd1Y2nbt2YGNRAIzbQ3bFNLxZGnIgQ0u/hUWkX96Aj4LJJeCCUGc34c4QRVPL8Abe3rbL5XEz/b3zMS6TxEhFRpdYJEws2E002E2koy2RzAHYWJRwS1MSo5N0pRY6EXlHRZUTcLndBNu6sDEH43Vz8j+fT/u+JowxFE2bQFHl+GSHKTKq2FgM6zi4/QFioWCf+v7KRAZLCZ2IvCOX20VRZT/rxM2fOvLBiKQIl9eHy+Mh1NaCL7+IaEdbr3pfXkGSIpN0poRORERkCLncbjyZ2RiXG8eJESgZR7ChHoBAcSme7NwkRyjpSAmdiBxVNBolHIoQicZwGUt8OXJDbu7RFxYWGYs8GZm4vD5ikTA2I4ucvEKMMbgDGRijMacy9JTQicgRrXtjEw31jbiMIRKJ4na7WfHY0+zZVcf5F53NuReeRfnkCckOU2TUcXk8uDz6mpWRof9pIvKOdmzbxVNP/JGFJ8/n9dc3MnfBLG791/+gs6MLgI3rttBwoJHP3fYvuLUOnYhI0mjZEhHpV1trO2+8tp7JUyawq6aWwqJ8anfuTSRzBz3yqyfZW7vvmK5traVm2y42vLWZzRu20tbSPpShi4iMOWqhE5E+2lrbWfm7P9PR0YU/4MPt9mCtxdNP95HP5z2m1rmW5lZe+cdamhqa+flPH2H3zj3MnlPFV+/8AnPmzRzKlyEiMmYooRORPrZu2kFjYzPtbR2EQxFOf+/JrFn1OieeNJ/xE0rZt/dA4tjrP30V48pKeO2VN9m2eTtl5WVEYzFszGHf3gP8+Q8vMbmynH+6+BxOPGk+G97czMZ1W/jV/Y/R1dkNwIZ1W/js//sKv1zxY0pKi5L1skVEUpYSOhHpo6mxmVAwzP/+7Nf88yc/QigU4dQzFtLdHeLr/3ULr73yJntr93H6Wadw6hnVbFy3hR/f9QBLL38/rS1t1GzbRSzm8NMf/gKAv7+4mmee/BP3P3I327buBEgkcwfV7dlPXe1+JXQiIu+CEjoR6WPSlIns3FFLNBLl3u//nJzcbMomlnLx0vP5yLWXseg9J/U6/sU//Z3S8SWEQiG6u4JUTJvMf3zpO72OaW1pY+NbWygqKaCjvbPPc/r8PnLytGm5iMi7oUkRItLHtBkVzJ47g//3mY+RlZ1Je1sHRSVFnPm+0/B4j/x34MEVtqy1ferC4QgzZk0lGoly6Ycv6FX3udv+hckV5UP1EkRExpRjbqEzxhQBZwK7rLVrhj4kEUk2j8fDaWeeRNXMSs469wzcLheTKyaSmZXZ7/HTZlTw5OPP4vf78Xq97Ni6i49ccxn/c88vE8fk5ecya850plVV8v6l57Jndx2L3nMSXR1dTJ1RwazjpmvpExGRd8n091d0rwOM+R1wq7X2LWNMGfAqsBqYBtxrrb1r2KMcJtXV1Xb16tXJDkMk5cViMd54dT3btu5gwoTxRKLxzcnr9h7g+T+8xOSKnkkRJ89PdqgiI65m2y7Wv7WZmu27Oem0E4hFoqx7cxN/+8srzJg9lYuWnsdxmuEtA2CMWWOtre63bgAJ3Tpr7Zye+18
CZllrP2aMyQH+aq1N2d/QSuhERGQ41e9v4O7/upeO9i7OOf9M8gty+dOzL/L4Q79LHFNUUsjPH/8hk6ZMTGKkkgqOlNANZAxd5JD75wBPAVhr2wFn8OGJiIikpx3bdlE2YTx/XvkSBQW5dHd18+Tjz/Y6prG+iQ1vbUlShJIuBjKGbrcx5tNALXAi8AyAMSYD8A5jbCKS4sKdQdobWnEcB6IxrGOJhqO43C7yJhaToVmtMoZYE58yZIzpU9dfmcixGEhCdx3w78D7gA9ba1t6yk8FHhimuEQkxbXXt9BUsx+31024I0gsGqV+8x5qX423RGSV5HP6J95P/sTiJEcqMnymTp/Cbx9/lvf905k0NzRTUJTPJR88n0d/9WTimJJxRcyeW5XEKCUdHHUMXTrTGDqR4bN79Sa6WjrIyM+m40ALWHjrt3/vdcykk2Zy8tVLcPezpZhIuqjZvpuN67awc0ctJ540j1jMYdOGrfz9xVeYMWs6/3TxOUroZECONIbuqL9FjTFPHqneWnvxuw1MRNJXsCNILBIj2NaFtRBq7+pzTP2m3US6QrhzldBJ+qqYOomKqZN6lZ16xkKuvv7DSYpI0tFAfosuAnYDDwEv8/a6oSIi7yiQE8CJxgjkZhENhgnk9F3DrnTmJLyZ/iREJyKSXgYyy3U88CVgLnA3sARosNb+xVr7l+EMTkRSV8Hk8QTys3B53QRys/Bm+Zm0cEaiPrs0n9nnn6TuVhGRIXDU36TW2hjxma3PGGP8wBXA88aYf7fW/mC4AxSR1JRdkoc3009nYysZeVnEIjHyJhZTseg4jNtF3oQizXIVERkiA/rTuCeRu5B4MlcBfB/49fCFJSLpwJ8VwJ8VSHYYIiJpbyCTIh4k3t36NPB1a+1bwx6ViIiIiAzYQFrorgI6gRnAZw5Z/NAA1lqbO0yxicgwam5qoX5/Ix6Pm0lTJuL1aZ1wEZFUNZAxdO84caKnK1ZEUoi1lrWr3+IfL63ml/c/RldnNxdeuoRP/us1TJxcluzwRETkXTjqLFdjzFfeoTwXeLa/OhEZvWq27+a11W/y47t+RntbB7FYjCcfe4ZH//dJxvJC4yIiqWwgy5a8xxhzx6EFxpjxwIvAn4clKhEZNntr69i/r75P+e9+vZLmxpaRD0hERAZtIAndxcDxxpjvAhhjqoCXgB9Za78+nMGJyNDLys4iJze7T3nl9MlkZGYkISIRERmsoyZ01togcCkwxRjzMPBH4AvW2v8e7uBEZOhNn1FJ+aQyplVVJMoCAT//8tl/JiNTS4yIiKSigSxb8tmeu6uAW4h3tVYeLLfWfnf4whORoZadk8Xic89gSuUkanftJeY4zJ5bxczZ05MdmoiIvEsDWbYk55D73++nTERSTF5+LiecNI8TTpqX7FBERGQIDGTZkgGNkzPGfNFau3zwIYmIiIjIsRjIpIiB+tAQXktEREREBmhAe7kOkDn6ISKSSoLBEOte30goFMQYF9FIjLfe2Mj+unpOOf0E5syfxeSK8mSHKSIy5g1lQqcVSUXSzLrXN+BEHVzGRWtLOyt//zxLP7iEaedWk+F109bVRkdrG9l52gFQRCSZhrLLVS10ImkkEo6Qn5tJc3MrAP6Aj/eeVc1xZdn4okFi3Z1kRbtwWpu0w4SISJINZOuvUwd4rUcHGYuIjCJONEJeThZlE0oAyAj4mVY5ERuL9Tou2tqIEwknI0QREekxkBa6Hxtj/tsYk3+kg6y13xyakOKMMZ82xmwyxqwzxvzXIeVfNMZs7ak775DyhcaYN3vqvm+MUYuhyCB4PG68Hjf5+TmMK85nQlkxmVn97CRhjAZciIgk2UASuoXABmCVMeaqYY4HAGPMYuASYL61dg7w7Z7y44DLgTnA+cCPjDHuntN+DNwAVPXczh+JWEXSldvnx+f3UZSTwfjxRfj8XvIL8zGe3kNvM0on4PL5khSliIjAwNahc4C7jDErgb8bY35E/O9xE6+2wzE
a+pPAf1prQz0xHOgpvwR4uKd8hzFmK3CyMaYGyLXW/h3AGPNz4APA08MQmwwBJxImGo6AdcCJYWMxjNuNJyMLl9eb7PCkhz8nh2h3F9ZaCvKysY4DWVOIdXcSC4fx5ebhzc5FDeIiIsk1oFmuxpjrgFuBLwP32OEfAT0DeI8x5g4gCHzeWvsKMBH4xyHH1faURXruH14uo1AsFCTU1oLb58cJhwk17E+MwXJnZJI9eSpuv/YUHQ1cHi++nLy+FXkFIx+MiIi8o4Hs5fo3oAZ4j7V231A9sTHmj8D4fqq+3BNXAXAqcBLwiDFmKv3PpLVHKO/veW8g3jXL5MmTjz1wGbRwWwueQCaRjjZsJNJrQH2su4tIR5sSOhERedestUS7u3CiEcDEuxSdKDYaxeX1487Mwp1mvUEDaaF7yFr7g6F+Ymvt+96pzhjzSeDXPS2Bq4wxDlBMvOVt0iGHlgN7e8rL+ynv73nvBe4FqK6u1lDuJIh2duD2Z4C1xELd/dR3QlESAhMRkbQQ7egg3NGG2+sDF8RCIYL1b7dJ+fILyZwwGZdnKJfjTa6BvJJrgSFP6I5iBXA28LwxZgbgAxqAJ4H/NcZ8F5hAfPLDKmttzBjT3rPEysvAx5IQswyQNzcfDBi3C09mNrHurt71OVqkdihZa2nYtpdQezexmIPb7SLY3kXdWzvIKsxl8smzKJ5aluwwRUSGhBOLEg12YXq+Z2LhEMGGA72OCbc04S8qweXJSVKUQ2+0pqb3A/cbY94CwsDVPa1164wxjwDrgSjwKWvtwUWxPgn8DMggPhlCEyJGKW9OLqGWpngyFwrizc4l0tEGgK+wGG9W+vyAjQbNu/aze/UW8suL6W7uABes++3bQ1Fr/r6es7+wjPzykiRGKSIyNKzjYJ1o/EF8+mZ8At7hxx22pmaqG0hCN98Y09ZP+bDNcrXWhoGPvkPdHcAd/ZSvBuYOdSwy9NxeHxlFJURDITxuN+5ABv6S8bg8btz+DIxrKDcwkY76NnLLCtj75g7KT6zirSf/1qs+GorQuL1OCZ2IpAWXx4vb6ycS7MLEHIzLhTuQQSx4yBAf48Ll8ycvyGEwkITuTWvtCcMeiYwpxuXGm5GZ7DDGCPv2v8bG/1o9/Ah76H1LpLsLolEw4ITDYAyezCw8gX4WFhYRGUWMMXiycsC4cKyDO5BBoGQcoaZGop3tuP0BMidOSbvfZ6O1y1VEhkhWST71W9dTNreSroY2KhYdx/rfv5yod/s8FE19e8J5tLODWDCIy+Oma+8ubDTedWE8XnIqq/AoEReRUc7t9+PyeYmFwz1/sfrJmJCBweDy+tJqMsRBA3lF2qNVJIUVTirFCUcIdXTjCXhxezyccPlZ7Fm7jczCHCpPm0PBpFIAnGg0Ps3fxoi0dySSOQAbjRBuaVJCJyIpwRgXnjG0BNZAEroyY8z336nSWvuZIYxHRIaYcRlKqsr7lFedtaBPmXViYC3G5Y53tR6m1xgUEREZNQaS0K0e9ihEZFRwebzxcSfREJ6sbKKd7b3qvXn5yQlMRESOaCB7uT44EoGISPLFZ4MFOLg1q7+wmFBzY/x+USne/rYBExGRpBvI1l9PHqneWnvx0IUjIsnmCWRgvT5i4RC+whJ8BUUYV3x5GWP622VPRESSbSBdrouA3cBDxHdh0G90kTRn3G5NfhARSSEDSejGA0uAK4CPAL8nvr/ruuEMTEREREQG5qhL8ltrY9baZ6y1VwOnAluJ77H66WGPTkRERESOakAr6xlj/MCFxFvpKoDvA78evrBEREREZKAGMiniQeJ7pD4NfN1a+9awRyUiIiIiAzaQFrqrgE5gBvCZQ2a5GcBaa3OHKTYRERERGYCBrEN31HF2IiIiIpI8A+lyLTxSvbW2aejCEREREZFjNZAu1zWApf/15ywwdUgjEhEREZFjMpAu18qRCERERERE3p2jjo8zxlz2DuU+Y8xXhj4kERERETkWA+lyvcEY83H
gU9ba7QDGmH8Cvgc8M5zBiYiIjBWRYIjWuiYi3WGMAetYGrbsoW1/M2VzKyibW0lGXlayw5RRaiBdrucZY64A/mCM+V/ia9KVAB+21r4+3AGKiIiMBY3b64hGojjhGG6vh9cff4GO+lYA9ry2lRlLTmT+B87A5dbiE0PJiURwYlE4uCyb4wDg8gdwuVLnvR7QThHAI8Ac4GagBTjbWrt5uIISEREZS0KdQWJRh876Nvw5GXQ3tCaSuYO2/mkt086YR864giRFmV5sLEako51oKIjL7QaPBxOL0X2gDicWw5dfQKCwBE9GZrJDHZCBjKE7A3gNKAImATcCvzXG/HvPlmAiIiIyGI4DBiwWANvTSnQo61istSMdWdqKdncSC3WDE8OJhDGOQ9fe3fgLisiaMAm310e4rYVweyvd9fvprt9HtKtz1H4GA2lLvAv4uLX2k9baZmvtCuAEwA+oy1VERGSQ/DmZGAxZhbkYl4vsknwCub1bhqYsmk1WsTZnGiqxYDex7q74A2NwwmECpeOx1tK9r5bufXswxtCxcxvddbvprqulbdtGop0dyQ38HQyky/Vka61jjKkk3u1qgQ3W2n8zxvxsWKMTEREZI4oqx9O2r2dShNtN9VXvo/bVrbTVNTHxhGlMWjgDt2egI6XkqNxujMfLwS1NjcdDLNiNy+PBiUQwHi9OOJQYUweAtYQaD+DJyuaQrVBHhYH8z8gyxtwHVANriS8wfLwxZg1w3TDGJiIiMmb4szMomT6RWCyGE45i3IaSGeUYY/D4vMkOL+14MrOx0SjR7m5cXi8erw97SJeqcbuwsVif85xIGKx9exLFKDGQhO4HwHrgcmutA2DiaelXgB8CHxu+8ERERMYWt9uNO8Od7DDSnscfwOTm4w5kYJ0YWAdfbh6xUBAwOKEQ7oJiaG3udZ6/qBQzCme/DiShO91ae82hBTaevv67MWbLsEQlIiIiMszc/gBufyDx2DoxrDFkTpxEqKmRSGcHmROnEGo8gLUOgZIyvDl5SYz4nQ0koRtdbYoiIiIiw8C43Phz83FiUby5BRgDLo8XX34BWHCN4jGMA2kz/Ksx5qvmsNF/Pdt+/WN4whIRERFJDpfbg9vrxeXxJh6P5mQOBtZC92ngPmCrMWYt8VmuJxBfm+7jwxeaiIiIiAzEQLb+agM+ZIyZBhxHvAv236y124Y7OBERERE5ugG3H/YkcIkkzhgzE/i8tfb64QhMRERERAZmIFt/zTfGrDTGvGWM+YYxZpwx5nHgOeLLmYiIiIhIEg1kUsRPgf8FLgPqgVeB7cB0a+33hjE2ERERERmAgXS5+q21P+u5v8kY83ngVmtt3+WTRURERGTEDSShCxhjTuDt9eg6gPkHlzGx1r46XMHJ2OFEI0S7u8DlglgMLNhYFAt4Ahm4MzJH3b55IiIio8VAErp9wHff4bEFzh7qoGRssdYSbmnGFQgQ7WgHINreRrSro+cIQ07l9FG7OreIiEiyDWTZkrNGIA4Zw2LBbsIdbfhcrkQz8NvJHIClc88ucqfPHvULO4qIiCTDQGa5VhljVvTMcn3IGDNxJAKTscPGongzMuNdrNYS3yq4Nyccim+eLCIiIn0MpLnjfuDnwAvAxcAPgKXDGZSMLS6fn8j+fQSKS4iFgv0e48srSGzBIqOX4zjU7dlPKBTGiTmJkbcFhfkUFRckNzgRkTQ2kIQux1r705773zLGaBKEDCm3z0/G+DKcaAzj9WEdh4zxEwnW74+33uXmkTF+IsY1kFV2JFkO7Gtgw/otRMMRQsEwwWCQ5559kZf+/DKl44v58n/czOlnnYLXq25zEZGh9m5muWYc+lizXGUoeLNyiEXCxKJRjOMAkJ2dizEu3D4fxu1OcoRyNOve2MjePfvwuN3U1R1g84ZtvPTnlwHYX1fPTTfcxq+e+DFz5s9KcqQiIulnIAldHZrlKiPA7fXh9vqSHYa8C8FgiF01tWTnZBEMhfG43fz1+VW
9jnEchx3bdiuhExEZBgOZ5bp4IBcyxiyx1v5h8CGJSKrxej3k5uUSCoYw1hCJxCgdX8z+uvpex+Xl5yQpQhGR9DaUg5LuHMJriUgKcbvdHDdvBl6fl+zcTCZMLOXKaz+I65Bxj6eevpBZx01PYpQiIulrKEcnaxl/kTFs5nHT8Xg8NDQ0gbVEIlG+/aOv0dTYTHFpMfOOn0XJuOJkhykikpaGMqHru3iYiIwp02ZUMG1GRbLDEBEZc7QOhIiIiEiKG8oWupohvJakkWhXJ9FQEGNcYB2cSBjj8eLNysHt9yc7PBERkZR31ITOGHPEXSGstb/u+Ve7R0gf0c4O2ndtI1A0DmsduvfvTdS5AxlkV1Th9mmpEhERkcEYSAvdY8Danhv0nvxggV8PbUiSLqy1BBsP4MstwIlGCLc09qqPBbuJdXcpoRMRERmkgSR0lwEfBuYDTwAPWWu3DmtUkiYsTjiMJ9sP1mKjsb5HOH3LRERE5NgcdVKEtfY31trLgfcC24DvGGNeMsa8d9ijk5RmjAt/UQnRjjYcx8GXl3/4AbgDGUmJTUREJJ0cy6SIINAKtAGTgcCwRCRpxZuTi3UcnEgEVyADXC4iba24fH4yy8qV0ImIiAyBgUyKWAxcAZwM/BG421q7ergDk/Tg8ngJFJXgRCNYC96cPDLGTcC4Pbjc7mSHJ0lgHUtHYyvRUBhiFpfHTWZxLl6/xlKKiLxbA2mhew54A3gJ8AMfM8Z87GCltfYzwxSbpBGXx5vsEGQUCLZ30VSzn3B3iFBbJ5tWriHY1sX446aw4ENnkltWlOwQRURS0kASun9Gu0CIyBBoqa2no76ZWDjKm0/8LfGbZd/6naz99Uucdv0FeHxK/kVEjtVREzpr7c9GIA4RGQO6GtsJdwTjDw77M3HfmzsItnSSXZo/4nGJiKS6gYyh+y29f/VaoAH4s7X2l8MVmIikH2+mH3eHByfm9KnLKMjGE1DrnIjIuzGQLtdv91NWCHzUGDPXWnvrEMeEMWYB8BPiM2mjwL9Ya1f11H0RuA6IAZ+x1j7bU74Q+BmQATwF/Ku1Vl3FIqNI/sQioqEIoY5uSmdO4sCm3QAYl2HhFWcTyM1KcoQiIqlpIF2uf+mv3BjzJLAGGPKEDvgv4OvW2qeNMRf0PD7LGHMccDkwB5gA/NEYM8NaGwN+DNwA/IN4Qnc+8PQwxCYi71LOuEKMy0VnUxt5E4qoOO04nGiM/PJi8stLkx2eiEjKOpZ16Hqx1saMMUc/8F1eHsjtuZ8HHNwA9BLgYWttCNhhjNkKnGyMqQFyrbV/BzDG/Bz4AEroREad7JJ8skvykx2GiEhaGcgYusJ+iguAjwHrhjyiuJuAZ40x3ya+m8VpPeUTibfAHVTbUxbpuX94uYiIiEjaG0gL3RriLWYHm+McoBF4Hvjku31iY8wfgfH9VH0ZOAe42Vr7uDFmGXAf8L5DYjiUPUJ5f897A/GuWSZPnvwuIhcREREZXQaS0H0Y2G2trQMwxlwNXEZ8wsJgumzf9051PV2m/9rz8FHgf3ru1wKTDjm0nHh3bG3P/cPL+3vee4F7AaqrqzVpQkRERFKeawDH/AQIARhjzgSWAw8S39f13mGKay/w3p77ZwNbeu4/CVxujPEbYyqBKmBVT7LZbow51cQH9n0MeGKYYhMREREZVQbSwua21jb13P8wcK+19nHgcWPM2mGK63rgbmOMBwjS00VqrV1njHkEWE98OZNP9cxwhXj378+IL1vyNJoQISIiImPEgBI6Y4zHWhslPrbthmM8/5hZa18CFr5D3R3AHf2UrwbmDkc8IiIiIqPZQBKyh4C/GGMagG7gRQBjzHTi3a4iIiIikkQDWVj4DmPMc0AZsPKQ3RdcwKeHMzgREREROboBdZlaa//RT9nmoQ9HRERERI7VQGa5ioiIiMgopoROREREJMUpoRMRERFJcUroRERERFKcEjoRERGRFKeETkRERCTFKaETERERSXF
K6ERERERSnBI6ERERkRSnhE5EREQkxSmhExEREUlxSuhEREREUpwSOhEREZEU50l2ACIiImONE4vRuq+JaHcYXOByu7FRh2BrJ7FIlNwJxRRMKkl2mJJClNCJiIiMIOtYDmyuJdwVxBvw4/a6iQSjbP7Ta+x7qwYAf3YGi264kNIZ5ckNVlKGulxFRERGUEdDK13NHbhdboKtXXS3dtHZ2JZI5gBCHd1seHoV4a5g8gKVlKKETkREZARFQxFikSjugBdvpp9YJEZXc3uf45p3HSDY3p2ECCUVqctVRERkBPkyfXi8HiLdIcLdYXwZfrJL8vscVzprMpkF2SMfoKQkJXQi8q5EQhGcaBTrWHBBd2MnXS3teDN8RIIRvD4P3a0dOFGHvInFFE4Zl+yQRUaFrKI8sorbCXV0k12YAy6DN9PHjHNOYMufX8c6DvmTSpj5vhPw+LzJDldShBI6ETkm3S0dtOxtwOVyEYtEMW4XHq+HunU7yZ9UTHtdE5lFOWz4/cs01ewDwOP3cvonL2bcrElJjl5kdCidUU5HYyuhjm6MAYyLyafOomx+JU7UIbesiKzCnGSHKSlECZ2IDJh1LHXrasjIz6ZtfzO+zABur5tYKMqe17ZSPLWMmpfXM/X0uYlkDuJjhtb97h8UVozDG/Al8RWIjB7ZRXlkF+UlOwxJE5oUISID1tXSnhjQ7fF7wTp0NrQR7gqSkZ9NpDtEZkEOwbauPue272siGgwnIWoRkfSnhE5EBsy44r8yjDFYx+I4lu7WDgK5WXQ2thLIzaS7uYOccYV9zp2wYBr+nMyRDllEZExQQiciA5aZn00gNxPjdmEMeAI+CiaV0tHQyvyl76Fp5wHmfuB0IqEwc95/arwVDyibU0HV4gW43PqVIyIyHDSGTkSOyfg5FbTUNuDxe3CiMQj4cPs8hDq6KZ42AZfHRWZ+Fg6G8XMqAEvO+CJ8GRo7JyIyXJTQicgx8WX4Ka2amOwwRETkEOr/EBEREUlxSuhEREREUpwSOhEREZEUp4ROREREJMVpUoSIHDNrLbFgECcaxhqDwYDjYGMRjNePNysbY0yywxQRGTOU0InIMQu3txFtb8WVkYnL6yXa2U7wQF280hiyyivwFxQlN0gRkTFEXa4ickxioSBOdxfG7cZlDMZx3k7mAKyla+8uot19t/8SEZHhoYRORI6JE41irdPr8eFsLIYTjYxkWCIiY5oSOhE5Ji6vN7Gn68HH0Hu8nPF4cXm1M4SIyEhRQicix8Tt8+PJyMYC1nGwxpA5YRL0JHnG7SGrfAqeQEZyAxURGUM0KUJEjpk3Jwfj9+FEIhgsTmYW2RXTsbFYT8KXmewQRUTGFCV0IvKueHx+8PmTHYaIiKAuVxEREZGUp4ROREREJMUpoRMRERFJcUroRERERFKcEjoRERGRFKeETkRERCTFKaETERERSXFK6ERERERSnBI6ERERkRSnhE5EREQkxSmhExEREUlx2stVRI5Ze1sHO7btoquzG5fbDdZh/756mhtbmDRlIgtPOZ7cvJxkhykiMmYooRORYxKJRHnr9Y10dnTS0d5Jdm42K3/3Z5757Z8Sx1z3Lx/hkzddi8/vS2KkIiJjh7pcReSY7NlVR2dHJwf2NZCVlUlHe2evZA7ggZ88TM2O3UmKUERk7FFCJyLHJBwJE43F6OzsIuY4dHZ09jnGcRw627uSEJ2IyNikLlcROSal40rYt+cAZRPH43IZxo0vobC4gKaG5sQxkysnMrmyPIlRioxuu2r2ULNtJ/6AHydmycrJpLOji21bdrC3dh/zFhzHqWdUU1CYl+xQJUUooRORY5JfkMvEKWV46+oJhyNkZmTw1W9+jgfv/T/Wv7mJkxadwKc+dx1FxQXJDlVkVGpv6+DXD/+O2XNnsG3rTk48+Xhamlu4757/5bXVbyaOu/qGD/OZW67H6/UmMVpJFUlL6IwxHwK+BswGTrbWrj6k7ovAdUAM+Iy19tme8oX
Az4AM4CngX6211hjjB34OLAQagQ9ba2tG7MWIjDHTplcwsbyMA/vqCYXClJaV8NXln8NaS/nkCQQyAskOUWTU2rJxOwCRSITurm7CoTA7tu3ulcwB/PK+x/jAsguYVlWRhCgl1SRzDN1bwFLghUMLjTHHAZcDc4DzgR8ZY9w91T8GbgCqem7n95RfBzRba6cD3wPuHPboRca4aCRKbl4OWdlZZGUHyMzOIjsnWzNbRY4iGArhD8R/TrKys3Ach0g40ue4WCxGuJ9ykf4krYXOWrsBwBhzeNUlwMPW2hCwwxizFTjZGFMD5Fpr/95z3s+BDwBP95zztZ7zHwN+aIwx1lo7zC9DZMxpbWlj9ctryc7OIhwK4w/4CQZDPPHoM2x4azOnv/ckll31AapmTk12qCKj0tRpU3jmieeYUjmJhv2N+E88jvLJEygqKaSxvilx3EmLTmDS5AlJjFRSyWgcQzcR+Mchj2t7yiI99w8vP3jObgBrbdQY0woUAQ3DHq3IGPPS8y+TmRmg4UAjxuXCcSzfuO277K+rB+D/fvEEm9Zv5YcP3KnFhUX6MX5CKcuuuoTdO/dy0mkL6OoMMq6shK/c8VmeeOwZNq3fypnnLOKKq5eSnZOV7HAlRQxrQmeM+SMwvp+qL1trn3in0/ops0coP9I5/cV0A/FuWyZPnvwOIYhIf4LBIL+6/zGu+5cr2bevnukzprJnd10imTto7Zp17KrZw9zjZyUpUpHRbe7xs5kxexpNjS24XS5iMYeJ5WVMn1GBL+CndFwxLpdWFpOBG9aEzlr7vndxWi0w6ZDH5cDenvLyfsoPPafWGOMB8oAm+mGtvRe4F6C6ulpdsiLHwO1yU1CQhzEGn8+HtQ5+f98ZeC6XC18/5SLyNp/Px/iy0mSHIWliNKb/TwKXG2P8xphK4pMfVllr64B2Y8ypJj7w7mPAE4ecc3XP/Q8Cf9L4OZGh5/V5ueYTVxDsDlI6rpj2tk7GlZVSfeqCXsd96KMXM6VyUv8XERGRIWeSlfcYYy4FfgCUAC3AWmvteT11Xwb+GYgCN1lrn+4pr+btZUueBj7ds2xJAPgFcALxlrnLrbXbjxZDdXW1Xb169dEOE5FDRKNR1r2xiUgkQjgUwev14jgxtm6uYVdNLfOOn82p76mmuKQw2aGKiKQVY8waa211v3VjuSFLCZ2IiIikiiMldKOxy1VEREREjoESOhEREZEUp4ROREREJMWNxoWFRURE0pq1lmiwGxuLgeOA2wM42HAY43bjzsjC7dXSPzJwSuhERERGkLWWcEc7WIsT6sadkQmxCF17duFEwgB4c/LInDAJtz+Q5GglVajLVUREZAQ54RBYB5wY4fY2sBBqakgkcwCR9lYine1JjFJSjRI6ERGREeTEYuBYbCyKLycPrCXW3dXnuFgwmIToJFUpoRMRERlBLrcHXC6M20uorQWMwZOVjePPpN6Vw44uD+3eHDwZmckOVVKIxtCJiIiMILffT1dnJ8HuMPtaI0wpiNAc8bB9VxPfvePH7NxRy5TKcm79+r9y+nuLkh2upAi10ImIiIyw1Ws2sG79NnbtqmPvviZqa/dx+xf+i507agHYuaOWL930DbZtqUluoJIylNCJiIiMIGst9fsbaGlqJTsni3AwHH/c3NrruOamVnbX1CYpSkk16nIVEREZQcYYMrMy8Hm9RCNRvD4vufm5uN1uYrFY4ji3201+YX7yApWUohY6ERGRETZ7zgwyszPo7g4SiUYpLMrn2k9c3uuY6z/9UWbPqUpShJJq1EInIiIywqbNqCAQ8HPgQAOxSJSsnCzOWnIGc4+fTcOBRsZPHMf8E+bgD/iTHaqkCCV0IiIiSTBxchkTJ5clOwxJE+pyFREREUlxSuhEREREUpwSOhEREZEUp4ROREREJMUpoRMRERFJcUroRERERFKcEjoRERGRFKeETkRERCTFKaETERERSXFK6ERERERSnBI6ERERkRSnhE5EREQ
kxSmhExEREUlxSuhEREREUpwSOhEREZEU50l2AKNNJBKhtraWYDCY7FBkCAUCAcrLy/F6vckORUREZMgpoTtMbW0tOTk5VFRUYIxJdjgyBKy1NDY2UltbS2VlZbLDERERGXJK6A4TDAaVzKUZYwxFRUXU19cnO5QxyVpLe1sHxkA0FsPlcgGGrMwMPF79ChIRGQr6bdoPJXPpR59pcmzfWsO+unp8Pg/dXSGikSi7avbwzG//RGFRHh+9bhknn3YCbrc72aGKiKQ0TYpIU263mwULFiRu//mf/5moq6+vx+v18t///d+9zqmoqOCyyy5LPH7ssce45pprRipkSTNtLe1s37KTpsYWOtu7Wf/mZta/tZnv3PEj1r2xkRf//DKfuuYW3lq7IdmhioikPCV0aSojI4O1a9cmbrfeemui7tFHH+XUU0/loYce6nPe6tWrWbdu3UiGKmmqrm4/dXsPkJ+fx/atO5lQPo6nn/hjr2Oi0RhrVr2RpAhFRNKHErox6KGHHuI73/kOtbW17Nmzp1fd5z//eb75zW8mKTJJJ263G7fLBVg8Hjc44A/4+xzn9/tGPjgRkTSjhC5NdXd39+py/b//+z8Adu/ezb59+zj55JNZtmxZovygZcuW8eqrr7J169ZkhC1pZMLE8UycXEZTQzNTqyqoqdnNB5Zd0OuY7Jwsqk9dkJwARUTSiCZFpKmDXa6He/jhh1m2bBkAl19+Oddddx2f/exnE/Vut5svfOELLF++nH/6p38aqXAlDWVmZTDzuOns2rEH47JUn7KAcDjM8rtv45W/vUbxuCLee85pzDxuerJDFRFJeUroxpiHHnqI/fv386tf/QqAvXv3smXLFqqqqhLHXHXVVSxfvpw5c+YkK0xJE+PLShlfVtqn/MIPLElCNCIi6UtdrmPIpk2b6OzsZM+ePdTU1FBTU8MXv/hFHn744V7Heb1ebr75Zu66667kBCoiIiLHRAldmjp8DN2tt97KQw89xKWXXtrruMsuu6zf2a7XXXcd0Wh0pMIVERGRQVCXa5qKxWIDOm7+/PmsX78egJqamkS53+9n7969wxGaiIiIDDG10ImIiIikOCV0IiIiIilOCZ2IiIhIilNCJyIiIpLilNCJiIiIpDgldCIiIiIpTgndKFVRUcG8efNYsGAB1dXVALz++ussWrSIefPmcdFFF9HW1pY4fvny5UyfPp2ZM2fy7LPPAtDV1cWFF17IrFmzmDNnDrfeemvi+F27drF48WJOOOEE5s+fz1NPPZWoe/DBB6mqqqKqqooHH3wwUX7NNddQWVmZWNvu4NZira2tXHTRRRx//PHMmTOHBx54AIjvG7t48WJmz57NnDlzuPvuuxPXampqYsmSJVRVVbFkyRKam5t7vf5du3aRnZ3Nt7/97T7vzcUXX8zcuXMTj2+++eZETDNmzCA/P/9Y324REZHUZq0ds7eFCxfaw61fv75PWTJMmTLF1tfX9yqrrq62zz//vLXW2vvuu8/edttt1lpr161bZ+fPn2+DwaDdvn27nTp1qo1Go7azs9P+6U9/stZaGwqF7BlnnGGfeuopa621119/vf3Rj36UOH/KlCnWWmsbGxttZWWlbWxstE1NTbaystI2NTVZa629+uqr7aOPPton1jvuuMPecsst1lprDxw4YAsKCmwoFLJ79+61a9assdZa29bWZquqquy6deustdZ+4QtfsMuXL7fWWrt8+fLE+QctXbrUfvCDH7Tf+ta3epU//vjj9oorrrBz5szp9337/ve/b6+99tp+60bLZysiIvJuAKvtO+Q0aqEbpFBzIy0b3qDpjdW0bHiDUHPjsD3Xpk2bOPPMMwFYsmQJjz/+OABPPPEEl19+OX6/n8rKSqZPn86qVavIzMxk8eLFAPh8Pk488URqa2sBMMYkWvhaW1uZMGECAM8++yxLliyhsLCQgoIClixZwjPPPHPEuIwxtLe3Y62lo6ODwsJCPB4PZWVlnHjiiQD
k5OQwe/Zs9uzZk4j56quvBuDqq69mxYoVieutWLGCqVOn9tlLtqOjg+9+97vcdttt7xjLQw89xBVXXHH0N1NERCSNKKEbhFBzI521O3EiYQCcSJjO2p1DktQZYzj33HNZuHAh9957LwBz587lySefBODRRx9l9+7dAOzZs4dJkyYlzi0vL08kTge1tLTw29/+lnPOOQeAr33ta/zyl7+kvLycCy64gB/84AcDutaXv/xl5s+fz80330woFALgxhtvZMOGDUyYMIF58+Zx991343L1/q9VU1PDa6+9ximnnALA/v37KSsrA6CsrIwDBw4A0NnZyZ133sntt9/e5z35yle+wuc+9zkyMzP7fc927tzJjh07OPvss9/5jRUREUlDSugGoXvfHrBO70LrxMsH6a9//SuvvvoqTz/9NPfccw8vvPAC999/P/fccw8LFy6kvb0dn88Xf0pr+5xvjEncj0ajXHHFFXzmM59h6tSpQLwl65prrqG2tpannnqKq666Csdxjnit5cuXs3HjRl555RWampq48847gXir3oIFC9i7dy9r167lxhtv7DW+r6Ojg8suu4y77rqL3NzcI77u22+/nZtvvpns7Oxe5WvXrmXr1q199qI91MMPP8wHP/hB3G73EZ9DREQk3Wgv10E42DI30PJjcbALtLS0lEsvvZRVq1bx+c9/npUrVwKwefNmfv/73wPxVrSDrXUAtbW1ifMBbrjhBqqqqrjpppsSZffdd1+iK3XRokUEg0EaGhooLy/n+eef73Wts846CyDRoub3+7n22msTExYeeOABbr31VowxTJ8+ncrKSjZu3MjJJ59MJBLhsssu48orr2Tp0qWJ644bN466ujrKysqoq6ujtLQUgJdffpnHHnuMW265hZaWFlwuF4FAALfbzZo1a6ioqCAajXLgwAHOOuusXrE+/PDD3HPPPYN520VERFKSWugGweX1HVP5QHV2dtLe3p64v3LlSubOnZvolnQch2984xt84hOfAOKzPh9++GFCoRA7duxgy5YtnHzyyQDcdttttLa2ctddd/V6jsmTJ/Pcc88BsGHDBoLBICUlJZx33nmsXLmS5uZmmpubWblyJeeddx4AdXV1QLxFcMWKFYmZpodea//+/WzatImpU6direW6665j9uzZfPazn+31/BdffHFiBu2DDz7IJZdcAsCLL75ITU0NNTU13HTTTXzpS1/ixhtv5JOf/CR79+6lpqaGl156iRkzZvRK5jZt2kRzczOLFi0a1HsvIiKSitRCNwgZ4yfSWbuzd7ercZExfuKgrrt///5E12I0GuUjH/kI559/PnfffXeiBWrp0qVce+21AMyZM4dly5Zx3HHH4fF4uOeee3C73dTW1nLHHXcwa9asxOSEG2+8kY9//ON85zvf4frrr+d73/sexhh+9rOfYYyhsLCQr3zlK5x00kkAfPWrX6WwsBCAK6+8kvr6eqy1LFiwgJ/85CdAfGzbNddcw7x587DWcuedd1JcXMxLL73EL37xi8TyKwDf/OY3ueCCC7j11ltZtmwZ9913H5MnT+bRRx8d1Hv20EMPcfnll/fqahYRERkrTH9jpsaK6upqu3r16l5lGzZsYPbs2QO+Rqi5ke59e3AiYVxeHxnjJ+IvKBrqUGUIHOtnKyIiMpoYY9ZYa6v7q1ML3SD5C4qUwImIiEhSJW0MnTHmQ8aYdcYYxxhTfUj5EmPMGmPMmz3/nn1I3cKe8q3GmO+bnv41Y4zfGPN/PeUvG2MqkvCSRERERJIimZMi3gKWAi8cVt4AXGStnQdcDfzikLofAzcAVT2383vKrwOarbXTge8Bdw5j3CIiIiKjStK6XK21G4A+g9itta8d8nAdEDDG+IFCINda+/ee834OfAB4GrgE+FrPOY8BPzTGGDuWBwiKjAKR7k6cSJR4UzpYxxILdoMTw5OdgzcrB+PSZHsRkcEa7WPoLgNes9aGjDETgdpD6mqBg9NJJwK7Aay
1UWNMK1BEvLWvF2PMDcRb+Zg8efIwhi4ytkW6Ooh1d2Pcbqwx2GiM7n212Gg0fkD9PrKnTMOXV5DcQEVE0sCw/mlsjPmjMeatfm6XDODcOcS7Tv/fwaJ+DrMDqOtdaO291tpqa211SUnJQF6GiLwLkY52cLlwYlGs4+CEQ28ncz269+3BOaxMRESO3bAmdNba91lr5/Zze+JI5xljyoHfAB+z1m7rKa4Fyg85rBzYe0jdpJ5zPUAe0DSUr2WkVVRUJNZvq66Ozxl5/fXXWbRoEfPmzeOiiy7qtb3W8uXLmT59OjNnzuTZZ58FoKuriwsvvJBZs2YxZ84cbr311sTxu3btYvHixZxwwgnMnz+fp556KlH34IMPUlVVRVVVVWLxX4BrrrmGyspKFixYwIIFC1i7di0Ara2tXHTRRRx//PHMmTOHBx54AIDdu3ezePFiZs+ezZw5c7j77rsT12pqamLJkiVUVVWxZMkSmpube73+Xbt2kZ2dndiNAuCss85i5syZiec/uNDyQY899hjGGA5fikaSwzrO239qWafvNnmA48RAIyNERAbPWpvUG/A8UH3I43zgdeCyfo59BTiV+NfE08AFPeWfAn7Sc/9y4JGBPPfChQvt4davX9+nLBmmTJli6+vre5VVV1fb559/3lpr7X333Wdvu+02a62169ats/Pnz7fBYNBu377dTp061UajUdvZ2Wn/9Kc/WWutDYVC9owzzrBPPfWUtdba66+/3v7oRz9KnD9lyhRrrbWNjY22srLSNjY22qamJltZWWmbmpqstdZeffXV9tFHH+0T6x133GFvueUWa621Bw4csAUFBTYUCtm9e/faNWvWWGutbWtrs1VVVXbdunXWWmu/8IUv2OXLl1trrV2+fHni/IOWLl1qP/jBD9pvfetbibL3vve99pVXXun3/Wpra7Pvec977CmnnPKOx4yWz3asCLU22+6G/ba7qd4GW5psV/0+2/j6K71uXfv3JjtMEZGUAay275DTJHPZkkuNMbXAIuD3xphne6puBKYDXzHGrO25lfbUfRL4H2ArsI14UgdwH1BkjNkKfBZ4uylqmO18eSO/+9J9PPKJu/jdl+5j58sbh+25Nm3axJlnngnAkiVLePzxxwF44oknuPzyy/H7/VRWVjJ9+nRWrVpFZmYmixcvBsDn83HiiSdSWxsfhmiMSbTwtba2JvZ+ffbZZ1myZAmFhYUUFBSwZMmSxJ6v78QYQ3t7O9ZaOjo6KCwsxOPxUFZWltihIicnh9mzZ7Nnz55EzFdffTUAV199NStWrEhcb8WKFUydOpU5c+YM+L35yle+wi233EIgEBjwOTK8PFnZuLx+jNsDGFz+AJnlFXgys3D5/GSML9cajiIiQyRpCZ219jfW2nJrrd9aO85ae15P+TestVnW2gWH3A701K228S7badbaG3uyVay1QWvth6y10621J1trt4/Ea9j58kZW/+qPdDXF913tampn9a/+OCRJnTGGc889l4ULF3LvvfcCMHfuXJ588kkAHn30UXbv3g3Anj17mDRpUuLc8vLyROJ0UEtLC7/97W8555xzAPja177GL3/5S8rLy7ngggv4wQ9+MKBrffnLX2b+/PncfPPNhEIhIL6d2IYNG5gwYQLz5s3j7rvvxnXYzMWamhpee+01TjnlFCC+vVlZWRkAZWVlie7Tzs5O7rzzTm6//fZ+35drr72WBQsW8B//8R8HW2157bXX2L17N+9///sH9ubKiHC5Pfhy8/Bm5eDOyMDl8+POyiarvJLcaTPJKB0/6H2PRUQkTusFDMKbT/yVWLj3gO5YOMqbT/x10Nf+61//yquvvsrTTz/NPffcwwsvvMD999/PPffcw8KFC2lvb8fni38ZHkxsDnXocjDRaJQrrriCz3zmM0ydOhWI7316zTXXUFtby1NPPcVVV12F4zhHvNby5cvZuHEjr7zyCk1NTdx5Z3y5v2effZYFCxawd+9e1q5dy40
33thrfF9HRweXXXYZd911F7m5uUd83bfffjs333wz2dnZfep+9atf8eabb/Liiy/y4osv8otf/ALHcbj55pv5zne+c7S3VJLE5Xbj8fnx+gN4/QHcgYASORGRIaaEbhAOtswNtPxYHOwCLS0t5dJLL2XVqlXMmjWLlStXsmbNGq644gqmTZsGxFvRDrbWAdTW1ibOB7jhhhuoqqripptuSpTdd999LFu2DIBFixYRDAZpaGg44rXKysowxuD3+7n22mtZtWoVAA888ABLly7FGMP06dOprKxk48Z4K2UkEuGyyy7jyiuvZOnSpYnrjhs3jrq6OgDq6uooLY33qr/88svccsstVFRUcNddd/HNb36TH/7whwBMnBhfpSYnJ4ePfOQjrFq1ivb2dt566y3OOussKioq+Mc//sHFF1+siREiIjKmKKEbhMzCnGMqH6jOzk7a29sT91euXMncuXMT3ZKO4/CNb3yDT3ziEwBcfPHFPPzww4RCIXbs2MGWLVs4+eSTAbjttttobW3lrrvu6vUckydP5rnnngPim9YHg0FKSko477zzWLlyJc3NzTQ3N7Ny5UrOO+88gEQCZq1lxYoVzJ07t8+19u/fz6ZNm5g6dSrWWq677jpmz57NZz/72V7Pf/HFFydm0D744INcckl8JZsXX3yRmpoaampquOmmm/jSl77EjTfeSDQapaEhvqxgJBLhd7/7HXPnziUvL4+GhobEOaeeeipPPvlkYmawiIjIWDDaFxYe1eZdcjqrf/XHXt2ubp+HeZecPqjr7t+/n0svvRSId5d+5CMf4fzzz+fuu+/mnnvuAWDp0qVce+21AMyZM4dly5Zx3HHH4fF4uOeee3C73dTW1nLHHXcwa9asxOSEG2+8kY9//ON85zvf4frrr+d73/sexhh+9rOfYYyhsLCQr3zlK5x00kkAfPWrX6WwsBCAK6+8kvr6eqy1LFiwgJ/85CdAfELCNddcw7x587DWcuedd1JcXMxLL73EL37xi8TyKwDf/OY3ueCCC7j11ltZtmwZ9913H5MnT+bRRx894nsSCoU477zziEQixGIx3ve+93H99dcP6n0WERFJF6a/MVNjRXV1tT28a27Dhg3Mnj17wNfY+fJG3nzir3Q1tZNZmMO8S05nyimzhjpUGQLH+tmKiIiMJsaYNdbafrug1EI3SFNOmaUETkRERJJKY+hEREREUpxa6ERkSESD3cSCQehZ5sbGosSCXbi8frw5uXgCGUmOUEQkfSmhE5FBi4VDhFubcfsD2FgUG4nQvX9voj7U4CVn6kzcfu3kISIyHNTlKiKDFguHcMIhbCwKjkOwqb5XvROJEO3uSlJ0IiLpTwmdiAyetT07ihiwYGOxvoc4zsjHJSIyRiihG6UqKioS67cdXCT39ddfZ9GiRcybN4+LLrqo1/Zay5cvZ/r06cycOZNnn30WgK6uLi688EJmzZrFnDlzuPXWWxPH79q1i8WLF3PCCScwf/58nnrqqUTdgw8+SFVVFVVVVYnFfwGuueYaKisrWbBgAQsWLGDt2rUAtLa2ctFFF3H88cczZ84cHnjgAQB2797N4sWLmT17NnPmzOHuu+9OXKupqYklS5ZQVVXFkiVLaG5u7vX6d+3aRXZ2Nt/+9rcTZeFwmBtuuIEZM2Ywa9YsHn/8cQBeeOEFTjzxRDweD4899tig3nd5d4zXB8aFcbmwxuDPLzrsAKMxdCIiw8laO2ZvCxcutIdbv359n7JkmDJliq2vr+9VVl1dbZ9//nlrrbX33Xefve2226y11q5bt87Onz/fBoNBu337djt16lQbjUZtZ2en/dOf/mSttTYUCtkzzjjDPvXUU9Zaa6+//nr7ox/9KHH+lClTrLXWNjY22srKStvY2GibmppsZWWlbWpqstZae/XVV9tHH320T6x33HGHveWWW6y11h44cMAWFBTYUChk9+7da9esWWOttbatrc1WVVXZdevWWWut/cIXvmCXL19
urbV2+fLlifMPWrp0qf3gBz9ov/WtbyXKvvrVr9ovf/nL1lprY7FY4v3ZsWOHff311+1VV13Vb3wHjZbPNl2FO9ptd8MBG2xttsHmRtuxZ5dt3vCGbd220YbbW63jOMkOUUQkpQGr7TvkNGqhG6Tfr/gD5522jOMrzuK805bx+xV/GLbn2rRpE2eeeSYAS5YsSbRQPfHEE1x++eX4/X4qKyuZPn06q1atIjMzk8WLFwPg8/k48cQTqa2tBcAYk2jha21tTezX+uyzz7JkyRIKCwspKChgyZIlPPPMM0eMyxhDe3s71lo6OjooLCzE4/FQVlaW2KEiJyeH2bNns2fPnkTMV199NQBXX301K1asSFxvxYoVTJ06lTlz5vR6nvvvv58vfvGLALhcLoqLi4F4a+b8+fNxufTfOZm8Wdn4C4rwBDLxZGURKCole+pMciqm483O7emSFRGR4aBvwEH4/Yo/8PVbv0Xdnv1Ya6nbs5+v3/qtIUnqjDGce+65LFy4kHvvvReAuXPn8uSTTwLw6KOPsnv3bgD27NnDpEmTEueWl5cnEqeDWlpa+O1vf8s555wDwNe+9jV++ctfUl5ezgUXXMAPfvCDAV3ry1/+MvPnz+fmm28mFAoB8e3ENmzYwIQJE5g3bx533313n+SqpqaG1157jVNOOQWIb29WVlYGQFlZWWKf2s7OTu68805uv/32PvFDfJuxE088kQ996EPs37//mN5TGX7G5cLt8+H2+nH7/Xh8PozLneywRETSnhK6Qfj+f/2UYHeoV1mwO8T3/+ung772X//6V1599VWefvpp7rnnHl544QXuv/9+7rnnHhYuXEh7ezs+nw+Id5sf7tDWkGg0yhVXXMFnPvMZpk6dCsBDDz3ENddcQ21tLU899RRXXXUVjuMc8VrLly9n48aNvPLKKzQ1NXHnnXcC8Va9BQsWsHfvXtauXcuNN97Ya3xfR0cHl112GXfddRe5ublHfN233347N998M9nZ2b3Ko9EotbW1nH766bz66qssWrSIz3/+8wN5K0VERNKeErpB2Lf3wDGVH4uDXaClpaVceumlrFq1ilmzZrFy5UrWrFnDFVdcwbRp04B4K9rB1jqA2traxPkAN9xwA1VVVdx0002Jsvvuu49ly5YBsGjRIoLBIA0NDUe8VllZGcYY/H4/1157LatWrQLggQceYOnSpRhjmD59OpWVlWzcuBGASCTCZZddxpVXXsnSpUsT1x03bhx1dXUA1NXVUVpaCsDLL7/MLbfcQkVFBXfddRff/OY3+eEPf0hRURGZmZlceumlAHzoQx/i1VdfHfT7LCIikg6U0A3C+Amlx1Q+UJ2dnbS3tyfur1y5krlz5ya6JR3H4Rvf+Aaf+MQnALj44ot5+OGHCYVC7Nixgy1btnDyyScDcNttt9Ha2spdd93V6zkmT57Mc889B8Q3rQ8Gg5SUlHDeeeexcuVKmpubaW5uZuXKlZx33nkAiQTMWsuKFSuYO3dun2vt37+fTZs2MXXqVKy1XHfddcyePZvPfvazvZ7/4osvTsygffDBB7nkkksAePHFF6mpqaGmpoabbrqJL33pS9x4440YY7jooot4/vnnAXjuuec47rjjBvU+i4iIpI13mi0xFm6DneX6u9+stCfNXGLnTT4zcTtp5hL7u9+sHPA1+rNt2zY7f/58O3/+fHvcccfZb3zjG9Zaa++66y5bVVVlq6qq7L/927/1mjX4jW98w06dOtXOmDEjMZN19+7dFrCzZs2yxx9/vD3++OPtT3/6U2ttfGbraaedZufPn2+PP/54++yzzyaudd9999lp06bZadOm2fvvvz9RvnjxYjt37lw7Z84ce+WVV9r29nZrrbV79uyxS5YsSdT94he/sNZa++KLL1rAzps3L/H8v//976211jY0NNizzz7bTp8+3Z599tm2sbGxz/tw++2395rlWlNTY9/znvfYefPm2bPPPtvu3LnTWmvtqlWr7MSJE21
mZqYtLCy0xx13XL/vq2a5iohIKuMIs1yN7WfM1FhRXV1tV69e3atsw4YNzJ49e8DX+P2KP/D9//op+/YeYPyEUj5zy/Vc+IElQx2qDIFj/WxFRERGE2PMGmttdX912st1kC78wBIlcCIiIpJUGkMnIiIikuKU0ImIiIikOCV0/RjL4wrTlT5TERFJZ0roDhMIBGhsbFQCkEastTQ2NhIIBJIdioiIyLDQpIjDlJeXU1tbS319fbJDkSEUCAQoLy9PdhgiIiLDQgndYbxeL5WVlckOQ0RERGTA1OUqIiIikuKU0ImIiIikOCV0IiIiIiluTG/9ZYypB3YeUlQMNCQpHHln+lxGL302o5M+l9FJn8volSqfzRRrbUl/FWM6oTucMWb1O+2RJsmjz2X00mczOulzGZ30uYxe6fDZqMtVREREJMUpoRMRERFJcUroers32QFIv/S5jF76bEYnfS6jkz6X0SvlPxuNoRMRERFJcWqhExEREUlxaZ/QGWMmGWP+bIzZYIxZZ4z5136OOcsY02qMWdtz++ohdecbYzYZY7YaY24d2ejT22A+m4GcK+/OYH9meurdxpjXjDG/G7nI098Q/D7LN8Y8ZozZ2HONRSP7CtLTEHwuN/ec95Yx5iFjTGBkX0F6Guj3RM9ns7bnmL8cUp5a3//W2rS+AWXAiT33c4DNwHGHHXMW8Lt+znUD24CpgA94/fBzdUvaZ3PUc3Ub+c/lkPrPAv97pGN0G/nPBngQ+HjPfR+Qn+zXlA63Qf4umwjsADJ6Hj8CXJPs15QOtwF+LvnAemByz+PSnn9T7vs/7VvorLV11tpXe+63AxuI/wANxMnAVmvtdmttGHgYuGR4Ih17BvPZDPJzlSMY7HtrjCkHLgT+Z3giHLsG89kYY3KBM4H7es4PW2tbhinUMWUIfh95gAxjjAfIBPYOfZRjzwA/l48Av7bW7uo57kBPecp9/6d9QncoY0wFcALwcj/Vi4wxrxtjnjbGzOkpmwjsPuSYWpQ0DIt38dkM9FwZhHf5udwF3AI4wx/h2PUuPpupQD3wQE93+P8YY7JGKNwx41g/F2vtHuDbwC6gDmi11q4cqXjHiiN8LjOAAmPM88aYNcaYj/WUp9z3/5hJ6Iwx2cDjwE3W2rbDql8lvp3G8cAPgBUHT+vnUpoWPMTe5WczkHNlEN7N52KMeT9wwFq7ZiRjHWve5c+MBzgR+LG19gSgExj944JSyLv8mSkg3vJTCUwAsowxHx2xoMeAo3wuHmAh8V6F84CvGGNmkILf/2MioTPGeIl/mL+y1v768HprbZu1tqPn/lOA1xhTTDwjn3TIoeWoKXxIDeKzOeq58u4N4nM5HbjYGFNDvIvibGPML0cu8vQ3yN9ntdbagy0UjxFP8GQIDOJzeR+ww1pbb62NAL8GThvB0NPaAL4naoFnrLWd1toG4AXgeFLw+z/tEzpjjCE+ZmSDtfa773DM+J7jMMacTPx9aQReAaqMMZXGGB9wOfDkyESe/gbz2QzkXHl3BvO5WGu/aK0tt9ZWEP95+ZO1Vq0NQ2SQn80+YLcxZmbPoecQHwwugzTI75ldwKnGmMye+nOIj/WSQRrg98QTwHuMMR5jTCZwCvH3P+W+/z3JDmAEnA5cBbxpjFnbU/YlYDKAtfYnwAeBTxpjokA3cLm11gJRY8yNwLPEZ7zcb61dN8Lxp7N3/dkYY87o79yev3xlcAbzMyPDa7CfzaeBX/V8QW0Hrh3B2NPZYD6Xl40xjxHvko0Cr5EGuxaMEkf9XKy1G4wxzwBvEB/3+z/W2rcAUu37XztFiIiIiKS4tO9yFREREUl3SuhEREREUpwSOhEREZEUp4ROREREJMUpoRMREREZRsaY+40xB4wxbw3w+GXGmPXGmHXGmP8d0Dma5SoiIiIyfIwxZwIdwM+ttXOPcmwV8AhwtrW22RhTesges+9ILXQiMmYZY4qMMWt7bvuMMXsOedzVc0yFMcYaY/7jkPO
KjTERY8wPex5/7bBz1xpj8t/hOc/qud51h5Sd0FP2+Z7HpxpjXu65zgZjzNeG830QkeFlrX0BaDq0zBgzzRjzjInvIfuiMWZWT9X1wD3W2uaec4+azMHYWFhYRKRf1tpGYAHEkzKgw1r77Z7HHYccuh14P/CVnscfAg5fZPR7B88dgDeBDxNfxR7iq9C/fkj9g8Aya+3rxhg3MBMRSTf3Ap+w1m4xxpwC/Ag4G5gBYIz5K/FFjb9mrX3maBdTQicicnTdwAZjTLW1djXxZOwR4pupvxu7gFxjzDjgAHA+cOguJ6VAHYC1Noa26BJJK8aYbOJ79j7asyMcgL/nXw9QBZxFfA/ZF40xc621LUe6phI6EZGBeRi43BizD4gR36j70ITuZmPMwX1rm621i49yvceIt/S9Rnzbp9Ahdd8DNhljngeeAR601gYH/xJEZJRwAS3W2gX91NUC/7DWRoAdxphNxBO8V452QRERObpngCXAFcD/9VP/PWvtgp7b0ZI5iLfwfajneg8dWmGt/XegGlgJfKTnuUUkTVhr24gnax8CMHHH91SvABb3lBcT74LdfrRrKqETERkAa20YWAN8Dnh8CK63D4gQTxKf66d+m7X2x8A5wPHGmKLBPqeIJIcx5iHg78BMY0xtz6SoK4HrjDGvEx+Te0nP4c8CjcaY9cCfgS/0jPc9InW5iogM3HeAv1hrGw8Z9zIYXwVKrbWxQ69njLkQeMrG15WqIt7F2zIUTygiI89ae8U7VJ3fz7EW+GzPbcCU0ImIDJC1dh19Z7cedOgYOoAPWGtrjnK9v71D1VXA93qWTokCV/ZMjhAR6ZcWFhYRERFJcRpDJyIiIpLi1OUqIjIMjDHnAXceVrzDWntpMuIRkfSmLlcRERGRFKcuVxEREZEUp4ROREREJMUpoRMRERFJcUroRERERFKcEjoRERGRFPf/Ad0zL2o8DUR0AAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 720x576 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ "plt.figure(figsize=(10,8))\n", "sns.scatterplot(x=\"TIME_MS\", y=\"SUM_PROXIMITY_KMS\", hue='EAN', data=test_1[test_1.MEASUREMENT == 28])\n", @@ -1058,7 +1169,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1070,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1088,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -1100,11 +1211,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "def c1(x):\n", + "def c1(x, g, d, c):\n", " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", " for j in 
x.StyleColor.unique():\n", " DF = x[(x.StyleColor == j)]\n", @@ -1115,10 +1226,10 @@ " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", " neighbors = max(2,minimal_epc.astype(int))\n", - " \n", - " if minimal_epc > 70:\n", - " minimal_epc = 70\n", - " neighbors = 70\n", + "\n", + " if minimal_epc > g:\n", + " minimal_epc = g\n", + " neighbors = g\n", " \n", " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", @@ -1131,7 +1242,7 @@ " curve=\"convex\", #parameter from figure\n", " direction=\"decreasing\") #parameter from figure\n", "\n", - " eps = max(6000,kneedle.knee_y/8)\n", + " eps = max(6000,kneedle.knee_y/d)\n", "\n", " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", " db.fit(X)\n", @@ -1142,7 +1253,7 @@ " if db.labels_[db.labels_ == -1].size != 0 :\n", " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", " for b in a.index:\n", - " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", + " if a[b] > c*proba[proba['EPC'] == b].count()[0] :\n", " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", " 'StyleColor':j}, ignore_index = True)\n", "\n", @@ -1152,7 +1263,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1178,7 +1289,43 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Wybrane StyleColor\n", + "z1 = test[(test.StyleColor == 'RH267-85J') | (test.StyleColor == 'RJ369-87X') | (test.StyleColor =='RM119-93X') \n", + " | (test.StyleColor == 'RS483-99X') | (test.StyleColor == 'SB281-90M')]\n", + "\n", + "z2 = test[(test.StyleColor == 'RV167-MLC') | (test.StyleColor == 'RV462-87X') | (test.StyleColor =='QJ677-33X') \n", + " | (test.StyleColor == 'RH797-00X') | 
(test.StyleColor == 'RH267-55J')]\n", + "\n", + "z3 = test[(test.StyleColor == 'SL171-99X') | (test.StyleColor == 'SO133-09M') | (test.StyleColor =='RB254-00X') \n", + " | (test.StyleColor == 'SF078-MLC') | (test.StyleColor == 'QY337-00X')]\n", + "\n", + "z4 = test[(test.StyleColor == 'SP095-59X') | (test.StyleColor == 'RN633-00X') | (test.StyleColor =='RH267-59J') \n", + " | (test.StyleColor == 'RV167-87X') | (test.StyleColor == 'RK485-99X')]\n", + "\n", + "z5 = test[(test.StyleColor == 'RJ365-09M') | (test.StyleColor == 'RH797-59X') | (test.StyleColor =='SP090-90X') \n", + " | (test.StyleColor == 'RH797-99X') | (test.StyleColor == 'RJ371-59M')]\n", + "\n", + "z6 = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='QZ555-20X') \n", + " | (test.StyleColor == 'RJ371-53M') | (test.StyleColor == 'RS054-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "u = train[(train.StyleColor == 'RV462-99X') | (train.StyleColor == 'RH797-81X') | (train.StyleColor =='SL171-99X')]\n", + "t = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='SL171-99X')]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1208,28 +1355,54 @@ " </tr>\n", " </thead>\n", " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [MEASUREMENT, EAN, StyleColor]\n", - 
"Index: []" + " MEASUREMENT EAN StyleColor\n", + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 26 5902805820447 RH797-81X\n", + "3 28 5902805820447 RH797-81X" ] }, - "execution_count": 17, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(train1)" + "c2(u)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1259,28 +1432,156 @@ " </tr>\n", " </thead>\n", " <tbody>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [MEASUREMENT, EAN, StyleColor]\n", - "Index: []" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c1(train2)" - ] - }, + " <tr>\n", + " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c2(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>MEASUREMENT</th>\n", + " <th>EAN</th>\n", + " <th>StyleColor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1(train2)" + ] + }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1339,7 +1640,7 @@ "2 11 5902690542769 QY337-00X" ] }, - "execution_count": 19, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1350,7 +1651,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1390,7 +1691,7 @@ "Index: []" ] }, - 
"execution_count": 20, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1401,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1441,7 +1742,7 @@ "Index: []" ] }, - "execution_count": 21, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1452,7 +1753,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -1518,7 +1819,7 @@ "3 28 5902805820447 RH797-81X" ] }, - "execution_count": 22, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } @@ -1529,7 +1830,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -1561,43 +1862,19 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>18</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>11</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>11</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>21</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>24</td>\n", + " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", - " <td>26</td>\n", + " <th>1</th>\n", + " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>6</th>\n", - " <td>28</td>\n", + " <th>2</th>\n", + " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", @@ -1607,27 +1884,23 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 18 5902851852638 SO133-09M\n", - "1 11 5902690542745 QY337-00X\n", - "2 11 5902690542769 QY337-00X\n", - "3 21 5902805820447 RH797-81X\n", - "4 24 
5902805820447 RH797-81X\n", - "5 26 5902805820447 RH797-81X\n", - "6 28 5902805820447 RH797-81X" + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" ] }, - "execution_count": 23, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(train)" + "c1(z6)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 68, "metadata": {}, "outputs": [ { @@ -1659,101 +1932,59 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>38</td>\n", - " <td>5902851414515</td>\n", - " <td>SL171-99X</td>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>42</td>\n", + " <td>11</td>\n", " <td>5902690542769</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>29</td>\n", + " <td>21</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>32</td>\n", + " <td>26</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>34</td>\n", + " <td>28</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>38</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9</th>\n", - " <td>42</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>10</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", " </tbody>\n", 
"</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 42 5902690542769 QY337-00X\n", - "2 29 5902805820447 RH797-81X\n", - "3 32 5902805820447 RH797-81X\n", - "4 34 5902805820447 RH797-81X\n", - "5 38 5902851852614 SO133-09M\n", - "6 38 5902851852638 SO133-09M\n", - "7 38 5902851852638 SO133-09M\n", - "8 42 5902851852638 SO133-09M\n", - "9 42 5902851852614 SO133-09M\n", - "10 42 5902851852638 SO133-09M" + " MEASUREMENT EAN StyleColor\n", + "0 11 5902690542745 QY337-00X\n", + "1 11 5902690542769 QY337-00X\n", + "2 21 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 28 5902805820447 RH797-81X" ] }, - "execution_count": 24, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(test)" + "c1(train)" ] }, { "cell_type": "code", - "execution_count": 268, + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -1785,37 +2016,19 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>22</td>\n", + " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>25</td>\n", + " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>24</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>26</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>30</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>28</td>\n", + " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", @@ -1825,26 +2038,23 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 22 5902805820447 RH797-81X\n", - "1 25 5902805820447 RH797-81X\n", - "2 24 5902805820447 RH797-81X\n", - "3 26 5902805820447 RH797-81X\n", - "4 30 5902805820447 
RH797-81X\n", - "5 28 5902805820447 RH797-81X" + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X" ] }, - "execution_count": 268, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(caly1,0.7)" + "c1(test)" ] }, { "cell_type": "code", - "execution_count": 225, + "execution_count": 92, "metadata": {}, "outputs": [ { @@ -1876,37 +2086,37 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>22</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>21</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>11</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>25</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>11</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>24</td>\n", + " <td>21</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>26</td>\n", + " <td>24</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>30</td>\n", + " <td>26</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", @@ -1922,331 +2132,27 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 22 5902805820447 RH797-81X\n", - "1 21 5902805820447 RH797-81X\n", - "2 25 5902805820447 RH797-81X\n", - "3 24 5902805820447 RH797-81X\n", - "4 26 5902805820447 RH797-81X\n", - "5 30 5902805820447 RH797-81X\n", + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", "6 28 5902805820447 RH797-81X" ] }, - 
"execution_count": 225, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c1(caly1, 0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", - "DF = df[(df.StyleColor == 'RH797-81X')]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "proba = DF[ DF.MEASUREMENT == 38 ]\n", - "X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", - "minimal_epc = np.floor(proba['EAN'].value_counts().min()/2)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "if minimal_epc > 70:\n", - " minimal_epc = 70\n", - " neighbors = 70" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3035684754501F8000B5B6E6 140\n", - "3035684754501F4000B5B6E5 135\n", - "3035684754501F0000B5B614 130\n", - "3035684754501F0000B5B632 90\n", - "3035684754501F8000B5B6A5 80\n", - "Name: EPC, dtype: int64" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "proba['EPC'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "67.0" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "minimal_epc" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "neighbors = max(2,minimal_epc.astype(int))\n", - "X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", - "nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", - "distances, indices = nbrs.kneighbors(X_embedded)\n", - "distance_desc = sorted(distances[:,neighbors-1], reverse=True)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", - " distance_desc, # y values\n", - " S=1.0, #parameter suggested from paper\n", - " curve=\"convex\", #parameter from figure\n", - " direction=\"decreasing\") #parameter from figure" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "5000" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eps = max(5000,kneedle.knee_y/8)\n", - "eps" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='brute')\n", - "db.fit(X)\n", - "y_pred = db.fit_predict(X)\n", - "clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", - "calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "if (db.labels_[db.labels_ == -1].size != 0) :\n", - " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", - " for b in a.index:\n", - " if a[b] > 0.6*proba[proba['EPC'] == b].count()[0] :\n", - " outliery = outliery.append({'MEASUREMENT': 24, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", - " 'StyleColor':'QY337-00X'}, ignore_index = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3035684754501F4000B5B6E5 39\n", - "3035684754501F8000B5B6E6 31\n", - "3035684754501F8000B5B6A5 10\n", - "3035684754501F0000B5B632 10\n", - "3035684754501F0000B5B614 6\n", - "Name: EPC, dtype: int64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a" - ] - }, - { - "cell_type": "code", - "execution_count": 
40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", - " <th>StyleColor</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [MEASUREMENT, EAN, StyleColor]\n", - "Index: []" - ] - }, - "execution_count": 40, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "outliery" + "c1(train,70,6,0.5)" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "# Wybrane StyleColor\n", - "z1 = test[(test.StyleColor == 'RH267-85J') | (test.StyleColor == 'RJ369-87X') | (test.StyleColor =='RM119-93X') \n", - " | (test.StyleColor == 'RS483-99X') | (test.StyleColor == 'SB281-90M')]\n", - "\n", - "z2 = test[(test.StyleColor == 'RV167-MLC') | (test.StyleColor == 'RV462-87X') | (test.StyleColor =='QJ677-33X') \n", - " | (test.StyleColor == 'RH797-00X') | (test.StyleColor == 'RH267-55J')]\n", - "\n", - "z3 = test[(test.StyleColor == 'SL171-99X') | (test.StyleColor == 'SO133-09M') | (test.StyleColor =='RB254-00X') \n", - " | (test.StyleColor == 'SF078-MLC') | (test.StyleColor == 'QY337-00X')]\n", - "\n", - "z4 = test[(test.StyleColor == 'SP095-59X') | (test.StyleColor == 'RN633-00X') | (test.StyleColor =='RH267-59J') \n", - " | (test.StyleColor == 'RV167-87X') | (test.StyleColor == 'RK485-99X')]\n", - "\n", - "z5 = test[(test.StyleColor == 'RJ365-09M') | (test.StyleColor == 'RH797-59X') | (test.StyleColor =='SP090-90X') 
\n", - " | (test.StyleColor == 'RH797-99X') | (test.StyleColor == 'RJ371-59M')]\n", - "\n", - "z6 = test[(test.StyleColor == 'RV462-99X') | (test.StyleColor == 'RH797-81X') | (test.StyleColor =='QZ555-20X') \n", - " | (test.StyleColor == 'RJ371-53M') | (test.StyleColor == 'RS054-99X')]" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", - " <th>StyleColor</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [MEASUREMENT, EAN, StyleColor]\n", - "Index: []" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c1(z1)" - ] - }, - { - "cell_type": "code", - "execution_count": 183, + "execution_count": 95, "metadata": {}, "outputs": [ { @@ -2278,54 +2184,114 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", + " <td>42</td>\n", + " <td>5902805533040</td>\n", + " <td>RH267-85J</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", " <td>38</td>\n", " <td>5902851414515</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", + " <th>2</th>\n", + " <td>38</td>\n", + " <td>5902851445700</td>\n", + " <td>RS483-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>35</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + 
" </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", + " <th>8</th>\n", " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>9</th>\n", " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>42</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", " <td>38</td>\n", " <td>5902851852614</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", + " <th>14</th>\n", " <td>38</td>\n", " <td>5902851852638</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", - " <th>6</th>\n", + " <th>15</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", " <td>42</td>\n", " <td>5902851852638</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", - " <th>7</th>\n", + " <th>17</th>\n", " <td>42</td>\n", " <td>5902851852614</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", - " <th>8</th>\n", + " <th>18</th>\n", " <td>42</td>\n", " <td>5902851852638</td>\n", " <td>SO133-09M</td>\n", @@ -2335,30 +2301,40 @@ "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 29 5902805820447 RH797-81X\n", - "2 32 5902805820447 RH797-81X\n", - "3 
34 5902805820447 RH797-81X\n", - "4 38 5902851852614 SO133-09M\n", - "5 38 5902851852638 SO133-09M\n", - "6 42 5902851852638 SO133-09M\n", - "7 42 5902851852614 SO133-09M\n", - "8 42 5902851852638 SO133-09M" + " MEASUREMENT EAN StyleColor\n", + "0 42 5902805533040 RH267-85J\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851445700 RS483-99X\n", + "3 35 5902690542769 QY337-00X\n", + "4 42 5902690542769 QY337-00X\n", + "5 42 5902690542745 QY337-00X\n", + "6 42 5902690542769 QY337-00X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 38 5902975236994 SF078-MLC\n", + "12 42 5902975236956 SF078-MLC\n", + "13 38 5902851852614 SO133-09M\n", + "14 38 5902851852638 SO133-09M\n", + "15 38 5902851852638 SO133-09M\n", + "16 42 5902851852638 SO133-09M\n", + "17 42 5902851852614 SO133-09M\n", + "18 42 5902851852638 SO133-09M" ] }, - "execution_count": 183, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(test)" + "c1(test,70,6,0.4)" ] }, { "cell_type": "code", - "execution_count": 186, + "execution_count": 268, "metadata": {}, "outputs": [ { @@ -2390,57 +2366,39 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>38</td>\n", - " <td>5902851414515</td>\n", - " <td>SL171-99X</td>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>29</td>\n", + " <td>25</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>32</td>\n", + " <td>24</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>34</td>\n", + " <td>26</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>38</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " 
<td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", - " <td>42</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>8</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>28</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -2448,29 +2406,26 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 29 5902805820447 RH797-81X\n", - "2 32 5902805820447 RH797-81X\n", - "3 34 5902805820447 RH797-81X\n", - "4 38 5902851852614 SO133-09M\n", - "5 38 5902851852638 SO133-09M\n", - "6 42 5902851852638 SO133-09M\n", - "7 42 5902851852614 SO133-09M\n", - "8 42 5902851852638 SO133-09M" + "0 22 5902805820447 RH797-81X\n", + "1 25 5902805820447 RH797-81X\n", + "2 24 5902805820447 RH797-81X\n", + "3 26 5902805820447 RH797-81X\n", + "4 30 5902805820447 RH797-81X\n", + "5 28 5902805820447 RH797-81X" ] }, - "execution_count": 186, + "execution_count": 268, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(test)" + "c1(caly1,0.7)" ] }, { "cell_type": "code", - "execution_count": 277, + "execution_count": 225, "metadata": {}, "outputs": [ { @@ -2498,16 +2453,46 @@ " <th>EAN</th>\n", " <th>StyleColor</th>\n", " </tr>\n", - " </thead>\n", - " <tbody>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>22</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", " <tr>\n", - " <th>0</th>\n", + " <th>1</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>25</td>\n", + " 
<td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", " <td>24</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", + " <th>4</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>30</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", " <td>28</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", @@ -2518,23 +2503,27 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 24 5902805820447 RH797-81X\n", - "1 28 5902805820447 RH797-81X" + "0 22 5902805820447 RH797-81X\n", + "1 21 5902805820447 RH797-81X\n", + "2 25 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 26 5902805820447 RH797-81X\n", + "5 30 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" ] }, - "execution_count": 277, + "execution_count": 225, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# dla /5\n", - "c1(train)" + "c1(caly1, 0.5)" ] }, { "cell_type": "code", - "execution_count": 278, + "execution_count": 84, "metadata": {}, "outputs": [ { @@ -2564,47 +2553,28 @@ " </tr>\n", " </thead>\n", " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>29</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>32</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 29 5902805820447 RH797-81X\n", - "1 32 5902805820447 RH797-81X\n", - "2 42 5902851852638 SO133-09M" + "Empty DataFrame\n", + "Columns: [MEASUREMENT, EAN, StyleColor]\n", + "Index: []" ] }, - "execution_count": 278, + "execution_count": 
84, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(test)" + "c1(z1)" ] }, { "cell_type": "code", - "execution_count": 280, + "execution_count": 183, "metadata": {}, "outputs": [ { @@ -2636,39 +2606,87 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>24</td>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", - " <td>28</td>\n", + " <th>2</th>\n", + " <td>32</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 24 5902805820447 RH797-81X\n", - "1 28 5902805820447 RH797-81X" + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" ] }, - "execution_count": 280, + "execution_count": 183, "metadata": {}, "output_type": "execute_result" } ], "source": 
[ - "# dla /6\n", - "c1(train)" + "c1(test)" ] }, { "cell_type": "code", - "execution_count": 281, + "execution_count": 186, "metadata": {}, "outputs": [ { @@ -2700,24 +2718,54 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", + " <th>2</th>\n", " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", + " <th>3</th>\n", " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", " <td>42</td>\n", " <td>5902851852638</td>\n", " <td>SO133-09M</td>\n", @@ -2728,13 +2776,18 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 29 5902805820447 RH797-81X\n", - "1 32 5902805820447 RH797-81X\n", - "2 34 5902805820447 RH797-81X\n", - "3 42 5902851852638 SO133-09M" + "0 38 5902851414515 SL171-99X\n", + "1 29 5902805820447 RH797-81X\n", + "2 32 5902805820447 RH797-81X\n", + "3 34 5902805820447 RH797-81X\n", + "4 38 5902851852614 SO133-09M\n", + "5 38 5902851852638 SO133-09M\n", + "6 42 5902851852638 SO133-09M\n", + "7 42 5902851852614 SO133-09M\n", + "8 42 5902851852638 SO133-09M" ] }, - "execution_count": 281, + "execution_count": 186, "metadata": {}, "output_type": "execute_result" } @@ -2745,7 +2798,59 @@ }, { "cell_type": 
"code", - "execution_count": 283, + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "def c2(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + " for j in x.StyleColor.unique():\n", + " DF = x[(x.StyleColor == j)]\n", + " measurements = DF.MEASUREMENT.unique()\n", + "\n", + " for i in measurements:\n", + " proba = DF[ DF.MEASUREMENT == i ]\n", + " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", + " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", + " neighbors = max(20,minimal_epc.astype(int))\n", + " \n", + " if minimal_epc > 70:\n", + " minimal_epc = 70\n", + " neighbors = 70\n", + " \n", + " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", + " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", + " distances, indices = nbrs.kneighbors(X_embedded)\n", + " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", + "\n", + " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", + " distance_desc, # y values\n", + " S=1.0, #parameter suggested from paper\n", + " curve=\"convex\", #parameter from figure\n", + " direction=\"decreasing\") #parameter from figure\n", + "\n", + " eps = max(5000,kneedle.knee_y/5)\n", + "\n", + " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", + " db.fit(X)\n", + " y_pred = db.fit_predict(X)\n", + " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", + " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", + "\n", + " if db.labels_[db.labels_ == -1].size != 0 :\n", + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.3*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 70, "metadata": {}, "outputs": [ { @@ -2777,24 +2882,48 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>11</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", - " <td>11</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", - " <td>24</td>\n", + " <th>6</th>\n", + " <td>27</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>7</th>\n", " <td>28</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", @@ -2805,26 +2934,28 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 11 5902690542745 QY337-00X\n", - "1 11 5902690542769 QY337-00X\n", - "2 24 5902805820447 RH797-81X\n", - "3 28 5902805820447 RH797-81X" + "0 21 5902805820447 RH797-81X\n", + "1 24 5902805820447 RH797-81X\n", + "2 24 5902805820461 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 27 5902805820447 RH797-81X\n", + "7 28 5902805820447 RH797-81X" ] }, - "execution_count": 283, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# eps min 5000 / 5\n", - "# \n", - "c1(train)" + "c2(train6)" ] }, { "cell_type": "code", - 
"execution_count": 284, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -2856,51 +2987,27 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>38</td>\n", - " <td>5902851414515</td>\n", - " <td>SL171-99X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>42</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>1</th>\n", " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", + " <th>2</th>\n", " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", - " <td>38</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>6</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7</th>\n", + " <th>3</th>\n", " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -2908,29 +3015,24 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 42 5902690542769 QY337-00X\n", - "2 29 5902805820447 RH797-81X\n", - "3 32 5902805820447 RH797-81X\n", - "4 34 5902805820447 RH797-81X\n", - "5 38 5902851852614 SO133-09M\n", - "6 38 5902851852638 SO133-09M\n", - "7 42 5902851852638 SO133-09M" + "0 29 5902805820447 RH797-81X\n", + "1 32 5902805820447 RH797-81X\n", + "2 34 5902805820447 RH797-81X\n", + "3 42 5902805820447 RH797-81X" ] }, - "execution_count": 284, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# c=0.6\n", - "c1(test)" + "c2(z6)" ] }, { "cell_type": "code", - "execution_count": 317, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -2962,24 
+3064,42 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", + " <td>18</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", " <td>11</td>\n", " <td>5902690542745</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", + " <th>2</th>\n", " <td>11</td>\n", " <td>5902690542769</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", + " <th>3</th>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", " <td>24</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>5</th>\n", + " <td>26</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", " <td>28</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", @@ -2990,24 +3110,27 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 11 5902690542745 QY337-00X\n", - "1 11 5902690542769 QY337-00X\n", - "2 24 5902805820447 RH797-81X\n", - "3 28 5902805820447 RH797-81X" + "0 18 5902851852638 SO133-09M\n", + "1 11 5902690542745 QY337-00X\n", + "2 11 5902690542769 QY337-00X\n", + "3 21 5902805820447 RH797-81X\n", + "4 24 5902805820447 RH797-81X\n", + "5 26 5902805820447 RH797-81X\n", + "6 28 5902805820447 RH797-81X" ] }, - "execution_count": 317, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(train)" + "c2(train)" ] }, { "cell_type": "code", - "execution_count": 318, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -3045,24 +3168,66 @@ " </tr>\n", " <tr>\n", " <th>1</th>\n", + " <td>42</td>\n", + " <td>5902690542769</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>42</td>\n", + " <td>5902690542745</td>\n", + " <td>QY337-00X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", " <td>29</td>\n", " <td>5902805820447</td>\n", " 
<td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", + " <th>4</th>\n", " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", + " <th>5</th>\n", " <td>34</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", + " <th>6</th>\n", + " <td>38</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>38</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>42</td>\n", + " <td>5902851852638</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>42</td>\n", + " <td>5902851852614</td>\n", + " <td>SO133-09M</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", " <td>42</td>\n", " <td>5902851852638</td>\n", " <td>SO133-09M</td>\n", @@ -3072,31 +3237,39 @@ "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 29 5902805820447 RH797-81X\n", - "2 32 5902805820447 RH797-81X\n", - "3 34 5902805820447 RH797-81X\n", - "4 42 5902851852638 SO133-09M" + " MEASUREMENT EAN StyleColor\n", + "0 38 5902851414515 SL171-99X\n", + "1 42 5902690542769 QY337-00X\n", + "2 42 5902690542745 QY337-00X\n", + "3 29 5902805820447 RH797-81X\n", + "4 32 5902805820447 RH797-81X\n", + "5 34 5902805820447 RH797-81X\n", + "6 38 5902851852614 SO133-09M\n", + "7 38 5902851852638 SO133-09M\n", + "8 38 5902851852638 SO133-09M\n", + "9 42 5902851852638 SO133-09M\n", + "10 42 5902851852614 SO133-09M\n", + "11 42 5902851852638 SO133-09M" ] }, - "execution_count": 318, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c1(test)" + "c2(test)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 76, 
"metadata": {}, "outputs": [], "source": [ - "def c2(x):\n", - " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", + "# najlepszy model\n", + "def c6(x):\n", + " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EPC', 'StyleColor'])\n", " for j in x.StyleColor.unique():\n", " DF = x[(x.StyleColor == j)]\n", " measurements = DF.MEASUREMENT.unique()\n", @@ -3105,7 +3278,7 @@ " proba = DF[ DF.MEASUREMENT == i ]\n", " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", - " neighbors = max(2,minimal_epc.astype(int))\n", + " neighbors = max(10,minimal_epc.astype(int))\n", " \n", " if minimal_epc > 70:\n", " minimal_epc = 70\n", @@ -3122,7 +3295,7 @@ " curve=\"convex\", #parameter from figure\n", " direction=\"decreasing\") #parameter from figure\n", "\n", - " eps = max(5000,kneedle.knee_y/5)\n", + " eps = max(2000,kneedle.knee_y/3)\n", "\n", " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", " db.fit(X)\n", @@ -3131,96 +3304,29 @@ " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", "\n", " if db.labels_[db.labels_ == -1].size != 0 :\n", - " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", - " for b in a.index:\n", - " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", - " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", - " 'StyleColor':j}, ignore_index = True)\n", - "\n", - "\n", - " return(outliery)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" 
class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", - " <th>StyleColor</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>21</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>24</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>26</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>28</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 21 5902805820447 RH797-81X\n", - "1 24 5902805820447 RH797-81X\n", - "2 26 5902805820447 RH797-81X\n", - "3 28 5902805820447 RH797-81X" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], + " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", + " for b in a.index:\n", + " if a[b] > 0.4*proba[proba['EPC'] == b].count()[0] :\n", + " outliery = outliery.append({'MEASUREMENT': i, 'EPC':proba[proba['EPC'] == b].EPC.iloc[0], \n", + " 'StyleColor':j}, ignore_index = True)\n", + "\n", + "\n", + " return(outliery)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], "source": [ - "c2(train6)" + "train7 = train[(train.StyleColor == 'SF078-MLC') | (train.StyleColor == 'RH797-81X') | (train.StyleColor == 'SL171-99X')]\n", + "test7 = test[(test.StyleColor == 'SF078-MLC') | (test.StyleColor == 'RH797-81X') | (test.StyleColor == 'SL171-99X')]" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 123, "metadata": {}, "outputs": [ { @@ -3244,53 +3350,166 @@ " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", - " <th>MEASUREMENT</th>\n", 
+ " <th></th>\n", + " <th>SUM_PROXIMITY_KMS</th>\n", + " <th>TIME_MS</th>\n", " <th>EAN</th>\n", + " <th>MEASUREMENT</th>\n", + " </tr>\n", + " <tr>\n", " <th>StyleColor</th>\n", + " <th>EPC</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", - " <th>0</th>\n", - " <td>29</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <th rowspan=\"5\" valign=\"top\">RH797-81X</th>\n", + " <th>3035684754501F0000B5B614</th>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", + " <td>1699</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", - " <td>32</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <th>3035684754501F0000B5B632</th>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", + " <td>1393</td>\n", " </tr>\n", " <tr>\n", - " <th>2</th>\n", - " <td>34</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <th>3035684754501F4000B5B6E5</th>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " <td>1646</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F8000B5B6A5</th>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " <td>1329</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3035684754501F8000B5B6E6</th>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " <td>1625</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"4\" valign=\"top\">SF078-MLC</th>\n", + " <th>30356849FC1723C000B5B1A3</th>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " <td>934</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30356849FC17244000B59A90</th>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " <td>1187</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30356849FC1724C000B599A7</th>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " <td>1108</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>30356849FC1724C000B59A42</th>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " <td>1179</td>\n", + " </tr>\n", + " <tr>\n", + " <th rowspan=\"4\" valign=\"top\">SL171-99X</th>\n", + " <th>303568480C287A8000B5BA63</th>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " <td>2191</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287A8000B5BABD</th>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " <td>785</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287A8000B5BADA</th>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " <td>1241</td>\n", + " </tr>\n", + " <tr>\n", + " <th>303568480C287AC000B5BAD5</th>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", + " <td>1152</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 29 5902805820447 RH797-81X\n", - "1 32 5902805820447 RH797-81X\n", - "2 34 5902805820447 RH797-81X" + " SUM_PROXIMITY_KMS TIME_MS EAN \\\n", + "StyleColor EPC \n", + "RH797-81X 3035684754501F0000B5B614 1699 1699 1699 \n", + " 3035684754501F0000B5B632 1393 1393 1393 \n", + " 3035684754501F4000B5B6E5 1646 1646 1646 \n", + " 3035684754501F8000B5B6A5 1329 1329 1329 \n", + " 3035684754501F8000B5B6E6 1625 1625 1625 \n", + "SF078-MLC 30356849FC1723C000B5B1A3 934 934 934 \n", + " 30356849FC17244000B59A90 1187 1187 1187 \n", + " 30356849FC1724C000B599A7 1108 1108 1108 \n", + " 30356849FC1724C000B59A42 1179 1179 1179 \n", + "SL171-99X 303568480C287A8000B5BA63 2191 2191 2191 \n", + " 303568480C287A8000B5BABD 785 785 785 \n", + " 303568480C287A8000B5BADA 1241 1241 1241 \n", + " 303568480C287AC000B5BAD5 1152 1152 1152 \n", + "\n", + " MEASUREMENT \n", + "StyleColor EPC \n", + "RH797-81X 3035684754501F0000B5B614 1699 \n", + " 3035684754501F0000B5B632 1393 \n", + " 3035684754501F4000B5B6E5 1646 \n", + " 3035684754501F8000B5B6A5 1329 \n", + " 3035684754501F8000B5B6E6 1625 \n", + 
"SF078-MLC 30356849FC1723C000B5B1A3 934 \n", + " 30356849FC17244000B59A90 1187 \n", + " 30356849FC1724C000B599A7 1108 \n", + " 30356849FC1724C000B59A42 1179 \n", + "SL171-99X 303568480C287A8000B5BA63 2191 \n", + " 303568480C287A8000B5BABD 785 \n", + " 303568480C287A8000B5BADA 1241 \n", + " 303568480C287AC000B5BAD5 1152 " ] }, - "execution_count": 44, + "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c2(z6)" + "train7.groupby(['StyleColor','EPC']).count()" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 77, "metadata": {}, "outputs": [ { @@ -3315,51 +3534,45 @@ " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", + " <th>EPC</th>\n", " <th>StyleColor</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>18</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>11</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>11</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", " <td>21</td>\n", - " <td>5902805820447</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", + " <th>3</th>\n", " <td>24</td>\n", - " <td>5902805820447</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", - " <td>26</td>\n", - " <td>5902805820447</td>\n", + " <th>4</th>\n", + " <td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>6</th>\n", + " <th>5</th>\n", " <td>28</td>\n", - " <td>5902805820447</td>\n", + 
" <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " </tbody>\n", @@ -3367,28 +3580,27 @@ "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 18 5902851852638 SO133-09M\n", - "1 11 5902690542745 QY337-00X\n", - "2 11 5902690542769 QY337-00X\n", - "3 21 5902805820447 RH797-81X\n", - "4 24 5902805820447 RH797-81X\n", - "5 26 5902805820447 RH797-81X\n", - "6 28 5902805820447 RH797-81X" + " MEASUREMENT EPC StyleColor\n", + "0 27 303568480C287AC000B5BAD5 SL171-99X\n", + "1 28 30356849FC1724C000B59A42 SF078-MLC\n", + "2 21 3035684754501F0000B5B614 RH797-81X\n", + "3 24 3035684754501F0000B5B614 RH797-81X\n", + "4 27 3035684754501F0000B5B614 RH797-81X\n", + "5 28 3035684754501F0000B5B614 RH797-81X" ] }, - "execution_count": 45, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c2(train)" + "c6(train7)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 79, "metadata": {}, "outputs": [ { @@ -3413,167 +3625,108 @@ " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", + " <th>EPC</th>\n", " <th>StyleColor</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>38</td>\n", - " <td>5902851414515</td>\n", + " <td>32</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>42</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>42</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>29</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " 
<td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>32</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>34</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>38</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", - " <td>42</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", - " </tr>\n", - " <tr>\n", - " <th>11</th>\n", - " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 42 5902690542769 QY337-00X\n", - "2 42 5902690542745 QY337-00X\n", - "3 29 5902805820447 RH797-81X\n", - "4 32 5902805820447 RH797-81X\n", - "5 34 5902805820447 RH797-81X\n", - "6 38 5902851852614 SO133-09M\n", - "7 38 5902851852638 SO133-09M\n", - "8 38 
5902851852638 SO133-09M\n", - "9 42 5902851852638 SO133-09M\n", - "10 42 5902851852614 SO133-09M\n", - "11 42 5902851852638 SO133-09M" + " MEASUREMENT EPC StyleColor\n", + "0 32 303568480C287AC000B5BAD5 SL171-99X\n", + "1 35 303568480C287AC000B5BAD5 SL171-99X\n", + "2 38 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287A8000B5BA63 SL171-99X\n", + "4 38 303568480C287A8000B5BADA SL171-99X\n", + "5 38 303568480C287A8000B5BABD SL171-99X\n", + "6 41 303568480C287AC000B5BAD5 SL171-99X\n", + "7 42 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287A8000B5BABD SL171-99X\n", + "9 29 3035684754501F0000B5B614 RH797-81X\n", + "10 32 3035684754501F0000B5B614 RH797-81X" ] }, - "execution_count": 46, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c2(test)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "def c3(x):\n", - " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", - " for j in x.StyleColor.unique():\n", - " DF = x[(x.StyleColor == j)]\n", - " measurements = DF.MEASUREMENT.unique()\n", - "\n", - " for i in measurements:\n", - " proba = DF[ DF.MEASUREMENT == i ]\n", - " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", - " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", - " neighbors = max(2,minimal_epc.astype(int))\n", - " \n", - " if minimal_epc > 70:\n", - " minimal_epc = 70\n", - " neighbors = 70\n", - " \n", - " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", - " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", - " distances, indices = nbrs.kneighbors(X_embedded)\n", - " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", - "\n", - " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", - " distance_desc, # y values\n", - " S=1.0, #parameter suggested from paper\n", - " curve=\"convex\", #parameter from figure\n", - " 
direction=\"decreasing\") #parameter from figure\n", - "\n", - " eps = max(5000,kneedle.knee_y/4)\n", - "\n", - " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='kd_tree')\n", - " db.fit(X)\n", - " y_pred = db.fit_predict(X)\n", - " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", - " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", - "\n", - " if db.labels_[db.labels_ == -1].size != 0 :\n", - " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", - " for b in a.index:\n", - " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", - " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", - " 'StyleColor':j}, ignore_index = True)\n", - "\n", - "\n", - " return(outliery)" + "c6(test7)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -3605,36 +3758,60 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>18</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>23</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>11</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>24</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>11</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", + " <td>21</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>24</td>\n", + " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>24</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>3</th>\n", - " <td>21</td>\n", + " <th>6</th>\n", + " <td>24</td>\n", 
+ " <td>5902805820461</td>\n", + " <td>RH797-81X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>26</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", - " <td>24</td>\n", + " <th>8</th>\n", + " <td>27</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", + " <th>9</th>\n", " <td>28</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", @@ -3645,26 +3822,30 @@ ], "text/plain": [ " MEASUREMENT EAN StyleColor\n", - "0 18 5902851852638 SO133-09M\n", - "1 11 5902690542745 QY337-00X\n", - "2 11 5902690542769 QY337-00X\n", - "3 21 5902805820447 RH797-81X\n", - "4 24 5902805820447 RH797-81X\n", - "5 28 5902805820447 RH797-81X" + "0 23 5902975236994 SF078-MLC\n", + "1 24 5902975236994 SF078-MLC\n", + "2 21 5902805820447 RH797-81X\n", + "3 24 5902805820447 RH797-81X\n", + "4 24 5902805820461 RH797-81X\n", + "5 24 5902805820447 RH797-81X\n", + "6 24 5902805820461 RH797-81X\n", + "7 26 5902805820447 RH797-81X\n", + "8 27 5902805820447 RH797-81X\n", + "9 28 5902805820447 RH797-81X" ] }, - "execution_count": 51, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c3(train)" + "c2(train7)" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 82, "metadata": {}, "outputs": [ { @@ -3696,137 +3877,150 @@ " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>38</td>\n", + " <td>32</td>\n", " <td>5902851414515</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>42</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", + " <td>38</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>42</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", + " <td>38</td>\n", + " 
<td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>38</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>42</td>\n", + " <td>5902851414515</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>42</td>\n", + " <td>5902851414508</td>\n", + " <td>SL171-99X</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", " <td>29</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>4</th>\n", + " <th>8</th>\n", " <td>32</td>\n", " <td>5902805820447</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>5</th>\n", - " <td>38</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <th>9</th>\n", + " <td>34</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>6</th>\n", + " <th>10</th>\n", " <td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>5902805820447</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", - " <th>7</th>\n", - " <td>42</td>\n", - " <td>5902851852614</td>\n", - " <td>SO133-09M</td>\n", + " <th>11</th>\n", + " <td>29</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " <tr>\n", - " <th>8</th>\n", + " <th>12</th>\n", + " <td>36</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>38</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>38</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>40</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>41</td>\n", + " <td>5902975236994</td>\n", + " <td>SF078-MLC</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", " 
<td>42</td>\n", - " <td>5902851852638</td>\n", - " <td>SO133-09M</td>\n", + " <td>5902975236956</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 38 5902851414515 SL171-99X\n", - "1 42 5902690542769 QY337-00X\n", - "2 42 5902690542745 QY337-00X\n", - "3 29 5902805820447 RH797-81X\n", - "4 32 5902805820447 RH797-81X\n", - "5 38 5902851852638 SO133-09M\n", - "6 42 5902851852638 SO133-09M\n", - "7 42 5902851852614 SO133-09M\n", - "8 42 5902851852638 SO133-09M" + " MEASUREMENT EAN StyleColor\n", + "0 32 5902851414515 SL171-99X\n", + "1 38 5902851414515 SL171-99X\n", + "2 38 5902851414508 SL171-99X\n", + "3 38 5902851414508 SL171-99X\n", + "4 38 5902851414508 SL171-99X\n", + "5 42 5902851414515 SL171-99X\n", + "6 42 5902851414508 SL171-99X\n", + "7 29 5902805820447 RH797-81X\n", + "8 32 5902805820447 RH797-81X\n", + "9 34 5902805820447 RH797-81X\n", + "10 42 5902805820447 RH797-81X\n", + "11 29 5902975236994 SF078-MLC\n", + "12 36 5902975236994 SF078-MLC\n", + "13 38 5902975236994 SF078-MLC\n", + "14 38 5902975236956 SF078-MLC\n", + "15 40 5902975236994 SF078-MLC\n", + "16 41 5902975236994 SF078-MLC\n", + "17 42 5902975236956 SF078-MLC" ] }, - "execution_count": 52, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c3(test)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "def c4(x):\n", - " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", - " for j in x.StyleColor.unique():\n", - " DF = x[(x.StyleColor == j)]\n", - " measurements = DF.MEASUREMENT.unique()\n", - "\n", - " for i in measurements:\n", - " proba = DF[ DF.MEASUREMENT == i ]\n", - " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", - " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", - " neighbors = max(2,minimal_epc.astype(int))\n", - " \n", - " 
if minimal_epc > 70:\n", - " minimal_epc = 70\n", - " neighbors = 70\n", - " \n", - " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", - " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", - " distances, indices = nbrs.kneighbors(X_embedded)\n", - " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", - "\n", - " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", - " distance_desc, # y values\n", - " S=1.0, #parameter suggested from paper\n", - " curve=\"convex\", #parameter from figure\n", - " direction=\"decreasing\") #parameter from figure\n", - "\n", - " eps = max(5000,kneedle.knee_y/2)\n", - "\n", - " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='auto')\n", - " db.fit(X)\n", - " y_pred = db.fit_predict(X)\n", - " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", - " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", - "\n", - " if (db.labels_[db.labels_ == -1].size != 0 ) & (db.labels_[db.labels_ == -1].size > minimal_epc ):\n", - " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts().sum()\n", - " b = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()[0] \n", - " if a - 2*b < 0:\n", - " outliery = outliery.append({'MEASUREMENT': i, 'EAN': proba[proba['EPC'] == calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts().index[0]].EAN.iloc[0], \"StyleColor\":j}, ignore_index = True)\n", - "\n", - " return(outliery)" + "c2(test7)" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 83, "metadata": {}, "outputs": [ { @@ -3851,269 +4045,262 @@ " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", + " <th>EPC</th>\n", " <th>StyleColor</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>6</td>\n", - " <td>5902805533040</td>\n", - " <td>RH267-85J</td>\n", + " <td>12</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", " 
</tr>\n", " <tr>\n", " <th>1</th>\n", - " <td>24</td>\n", - " <td>5902805533040</td>\n", - " <td>RH267-85J</td>\n", + " <td>23</td>\n", + " <td>303568480C2B874000B59A39</td>\n", + " <td>RS483-99X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>4</td>\n", - " <td>5902851445731</td>\n", + " <td>28</td>\n", + " <td>303568480C2B868000B599B2</td>\n", " <td>RS483-99X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>7</td>\n", - " <td>5902851445700</td>\n", - " <td>RS483-99X</td>\n", + " <td>1</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>11</td>\n", - " <td>5902851445731</td>\n", - " <td>RS483-99X</td>\n", + " <td>16</td>\n", + " <td>303568480C357A0000B59999</td>\n", + " <td>SB281-90M</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>28</td>\n", - " <td>5902805303681</td>\n", + " <td>7</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", " <td>RJ369-87X</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>1</td>\n", - " <td>5902805431803</td>\n", - " <td>RM119-93X</td>\n", + " <td>17</td>\n", + " <td>303568480C3455C000B5B30A</td>\n", + " <td>RV167-MLC</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>5</td>\n", - " <td>5902805431797</td>\n", - " <td>RM119-93X</td>\n", + " <td>17</td>\n", + " <td>30356847542CCD0000B59A80</td>\n", + " <td>QJ677-33X</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>6</td>\n", - " <td>5902805431803</td>\n", - " <td>RM119-93X</td>\n", + " <td>26</td>\n", + " <td>30356847542CCD8000B599FA</td>\n", + " <td>QJ677-33X</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>12</td>\n", - " <td>5902851535913</td>\n", - " <td>RV167-MLC</td>\n", + " <td>26</td>\n", + " <td>30356847542CCD0000B59A26</td>\n", + " <td>QJ677-33X</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", - " <td>2</td>\n", - " <td>5902975217986</td>\n", - " <td>RV462-87X</td>\n", + " <td>4</td>\n", + " <td>3035684754340CC000B594C3</td>\n", + " 
<td>RH267-55J</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", - " <td>4</td>\n", - " <td>5902851414508</td>\n", - " <td>SL171-99X</td>\n", + " <td>18</td>\n", + " <td>3035684754340D0000B594EB</td>\n", + " <td>RH267-55J</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", - " <td>7</td>\n", - " <td>5902851414508</td>\n", - " <td>SL171-99X</td>\n", + " <td>18</td>\n", + " <td>3035684754340CC000B594C6</td>\n", + " <td>RH267-55J</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", - " <td>12</td>\n", - " <td>5902851414508</td>\n", + " <td>27</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", - " <td>4</td>\n", - " <td>5902851852638</td>\n", + " <td>7</td>\n", + " <td>303568480C5343C000B599F6</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", - " <td>7</td>\n", - " <td>5902851852638</td>\n", + " <td>17</td>\n", + " <td>303568480C5343C000B599C8</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", - " <td>14</td>\n", - " <td>5902851852638</td>\n", + " <td>17</td>\n", + " <td>303568480C53434000B599E1</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", - " <td>20</td>\n", - " <td>5902851852638</td>\n", + " <td>24</td>\n", + " <td>303568480C53434000B599E1</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", - " <td>2</td>\n", - " <td>5902690542769</td>\n", - " <td>QY337-00X</td>\n", + " <td>24</td>\n", + " <td>30356847540FE2C000B59A68</td>\n", + " <td>RB254-00X</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", - " <td>4</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>28</td>\n", + " <td>30356849FC1724C000B59A42</td>\n", + " <td>SF078-MLC</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", - " <td>13</td>\n", - " <td>5902690542769</td>\n", + " <td>11</td>\n", + " <td>303568458835008000B5BAD1</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", - " <td>16</td>\n", - " 
<td>5902690542745</td>\n", + " <td>11</td>\n", + " <td>303568458835010000B5BA58</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", - " <td>23</td>\n", - " <td>5902690542745</td>\n", + " <td>19</td>\n", + " <td>303568458835008000B5BAD1</td>\n", " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", - " <td>24</td>\n", - " <td>5902805219685</td>\n", - " <td>RN633-00X</td>\n", + " <td>19</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", - " <td>1</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>7</td>\n", + " <td>303568475415740000B5A5CD</td>\n", + " <td>RN633-00X</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", - " <td>2</td>\n", - " <td>5902805533255</td>\n", + " <td>1</td>\n", + " <td>303568475434134000B5B6DF</td>\n", " <td>RH267-59J</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", - " <td>13</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>2</td>\n", + " <td>30356847542B6D4000B5B656</td>\n", + " <td>RJ365-09M</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", - " <td>21</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>2</td>\n", + " <td>30356847542B6D0000B5B65A</td>\n", + " <td>RJ365-09M</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", - " <td>24</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>12</td>\n", + " <td>30356849FC1E348000B5B2D4</td>\n", + " <td>SP090-90X</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", - " <td>26</td>\n", - " <td>5902805444698</td>\n", - " <td>RJ365-09M</td>\n", + " <td>21</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", - " <td>21</td>\n", - " <td>5902805820447</td>\n", + " <td>24</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", - " <td>23</td>\n", - " <td>5902805820447</td>\n", + " 
<td>27</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>28</td>\n", - " <td>5902805820447</td>\n", + " <td>3035684754501F0000B5B614</td>\n", " <td>RH797-81X</td>\n", " </tr>\n", - " <tr>\n", - " <th>33</th>\n", - " <td>3</td>\n", - " <td>5902805385823</td>\n", - " <td>RJ371-53M</td>\n", - " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 6 5902805533040 RH267-85J\n", - "1 24 5902805533040 RH267-85J\n", - "2 4 5902851445731 RS483-99X\n", - "3 7 5902851445700 RS483-99X\n", - "4 11 5902851445731 RS483-99X\n", - "5 28 5902805303681 RJ369-87X\n", - "6 1 5902805431803 RM119-93X\n", - "7 5 5902805431797 RM119-93X\n", - "8 6 5902805431803 RM119-93X\n", - "9 12 5902851535913 RV167-MLC\n", - "10 2 5902975217986 RV462-87X\n", - "11 4 5902851414508 SL171-99X\n", - "12 7 5902851414508 SL171-99X\n", - "13 12 5902851414508 SL171-99X\n", - "14 4 5902851852638 SO133-09M\n", - "15 7 5902851852638 SO133-09M\n", - "16 14 5902851852638 SO133-09M\n", - "17 20 5902851852638 SO133-09M\n", - "18 2 5902690542769 QY337-00X\n", - "19 4 5902690542745 QY337-00X\n", - "20 13 5902690542769 QY337-00X\n", - "21 16 5902690542745 QY337-00X\n", - "22 23 5902690542745 QY337-00X\n", - "23 24 5902805219685 RN633-00X\n", - "24 1 5902805533255 RH267-59J\n", - "25 2 5902805533255 RH267-59J\n", - "26 13 5902805533255 RH267-59J\n", - "27 21 5902805533255 RH267-59J\n", - "28 24 5902805533255 RH267-59J\n", - "29 26 5902805444698 RJ365-09M\n", - "30 21 5902805820447 RH797-81X\n", - "31 23 5902805820447 RH797-81X\n", - "32 28 5902805820447 RH797-81X\n", - "33 3 5902805385823 RJ371-53M" + " MEASUREMENT EPC StyleColor\n", + "0 12 303568480C2B874000B59A39 RS483-99X\n", + "1 23 303568480C2B874000B59A39 RS483-99X\n", + "2 28 303568480C2B868000B599B2 RS483-99X\n", + "3 1 303568480C357A0000B59999 SB281-90M\n", + "4 16 303568480C357A0000B59999 SB281-90M\n", + "5 7 30356847541DA80000B5BA54 
RJ369-87X\n", + "6 17 303568480C3455C000B5B30A RV167-MLC\n", + "7 17 30356847542CCD0000B59A80 QJ677-33X\n", + "8 26 30356847542CCD8000B599FA QJ677-33X\n", + "9 26 30356847542CCD0000B59A26 QJ677-33X\n", + "10 4 3035684754340CC000B594C3 RH267-55J\n", + "11 18 3035684754340D0000B594EB RH267-55J\n", + "12 18 3035684754340CC000B594C6 RH267-55J\n", + "13 27 303568480C287AC000B5BAD5 SL171-99X\n", + "14 7 303568480C5343C000B599F6 SO133-09M\n", + "15 17 303568480C5343C000B599C8 SO133-09M\n", + "16 17 303568480C53434000B599E1 SO133-09M\n", + "17 24 303568480C53434000B599E1 SO133-09M\n", + "18 24 30356847540FE2C000B59A68 RB254-00X\n", + "19 28 30356849FC1724C000B59A42 SF078-MLC\n", + "20 11 303568458835008000B5BAD1 QY337-00X\n", + "21 11 303568458835010000B5BA58 QY337-00X\n", + "22 19 303568458835008000B5BAD1 QY337-00X\n", + "23 19 303568458835010000B5BA58 QY337-00X\n", + "24 7 303568475415740000B5A5CD RN633-00X\n", + "25 1 303568475434134000B5B6DF RH267-59J\n", + "26 2 30356847542B6D4000B5B656 RJ365-09M\n", + "27 2 30356847542B6D0000B5B65A RJ365-09M\n", + "28 12 30356849FC1E348000B5B2D4 SP090-90X\n", + "29 21 3035684754501F0000B5B614 RH797-81X\n", + "30 24 3035684754501F0000B5B614 RH797-81X\n", + "31 27 3035684754501F0000B5B614 RH797-81X\n", + "32 28 3035684754501F0000B5B614 RH797-81X" ] }, - "execution_count": 61, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c4(train)" + "c6(train)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 84, "metadata": {}, "outputs": [ { @@ -4138,359 +4325,188 @@ " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", + " <th>EPC</th>\n", " <th>StyleColor</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", - " <td>41</td>\n", - " <td>5902805532999</td>\n", - " <td>RH267-55J</td>\n", + " <td>42</td>\n", + " <td>3035684754340E0000B594E8</td>\n", + " <td>RH267-85J</td>\n", " </tr>\n", " <tr>\n", " 
<th>1</th>\n", " <td>32</td>\n", - " <td>5902851414515</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", - " <td>34</td>\n", - " <td>5902851414515</td>\n", + " <td>35</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", - " <td>42</td>\n", - " <td>5902851414515</td>\n", + " <td>38</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", - " <td>32</td>\n", - " <td>5902805431803</td>\n", - " <td>RM119-93X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BA63</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", - " <td>40</td>\n", - " <td>5902805431803</td>\n", - " <td>RM119-93X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BADA</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", - " <td>32</td>\n", - " <td>5902851445700</td>\n", - " <td>RS483-99X</td>\n", + " <td>38</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", - " <td>37</td>\n", - " <td>5902851445700</td>\n", - " <td>RS483-99X</td>\n", + " <td>41</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", - " <td>40</td>\n", - " <td>5902851445731</td>\n", - " <td>RS483-99X</td>\n", + " <td>42</td>\n", + " <td>303568480C287AC000B5BAD5</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", - " <td>32</td>\n", - " <td>5902690542745</td>\n", - " <td>QY337-00X</td>\n", + " <td>42</td>\n", + " <td>303568480C287A8000B5BABD</td>\n", + " <td>SL171-99X</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", - " <td>36</td>\n", - " <td>5902805820423</td>\n", - " <td>RH797-59X</td>\n", + " <td>38</td>\n", + " <td>303568480C2B868000B599B2</td>\n", + " <td>RS483-99X</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", - " <td>37</td>\n", - 
" <td>5902805303681</td>\n", - " <td>RJ369-87X</td>\n", + " <td>38</td>\n", + " <td>303568480C34548000B5B2B5</td>\n", + " <td>RV167-87X</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", - " <td>29</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>42</td>\n", + " <td>303568458835010000B5BA58</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", - " <td>32</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>42</td>\n", + " <td>303568458835008000B5BAD1</td>\n", + " <td>QY337-00X</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>32</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>30356847541DA80000B5BA54</td>\n", + " <td>RJ369-87X</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", - " <td>38</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>29</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", - " <td>40</td>\n", - " <td>5902805533255</td>\n", - " <td>RH267-59J</td>\n", + " <td>32</td>\n", + " <td>3035684754501F0000B5B614</td>\n", + " <td>RH797-81X</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>29</td>\n", - " <td>5902975236994</td>\n", - " <td>SF078-MLC</td>\n", + " <td>303568475415744000B599FE</td>\n", + " <td>RN633-00X</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", - " <td>36</td>\n", - " <td>5902975236994</td>\n", - " <td>SF078-MLC</td>\n", + " <td>39</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", - " <td>41</td>\n", - " <td>5902975236994</td>\n", - " <td>SF078-MLC</td>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599F6</td>\n", + " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", - " <td>39</td>\n", - " <td>5902851852638</td>\n", + " <td>42</td>\n", + " <td>303568480C53434000B599E1</td>\n", " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", - " 
<td>32</td>\n", - " <td>5902805820546</td>\n", - " <td>RH797-00X</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 41 5902805532999 RH267-55J\n", - "1 32 5902851414515 SL171-99X\n", - "2 34 5902851414515 SL171-99X\n", - "3 42 5902851414515 SL171-99X\n", - "4 32 5902805431803 RM119-93X\n", - "5 40 5902805431803 RM119-93X\n", - "6 32 5902851445700 RS483-99X\n", - "7 37 5902851445700 RS483-99X\n", - "8 40 5902851445731 RS483-99X\n", - "9 32 5902690542745 QY337-00X\n", - "10 36 5902805820423 RH797-59X\n", - "11 37 5902805303681 RJ369-87X\n", - "12 29 5902805820447 RH797-81X\n", - "13 32 5902805820447 RH797-81X\n", - "14 32 5902805533255 RH267-59J\n", - "15 38 5902805533255 RH267-59J\n", - "16 40 5902805533255 RH267-59J\n", - "17 29 5902975236994 SF078-MLC\n", - "18 36 5902975236994 SF078-MLC\n", - "19 41 5902975236994 SF078-MLC\n", - "20 39 5902851852638 SO133-09M\n", - "21 32 5902805820546 RH797-00X" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c4(test)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "def c5(x):\n", - " outliery = pd.DataFrame(columns = ['MEASUREMENT', 'EAN', 'StyleColor'])\n", - " for j in x.StyleColor.unique():\n", - " DF = x[(x.StyleColor == j)]\n", - " measurements = DF.MEASUREMENT.unique()\n", - "\n", - " for i in measurements:\n", - " proba = DF[ DF.MEASUREMENT == i ]\n", - " X = np.asarray(proba[['SUM_PROXIMITY_KMS','TIME_MS']]).reshape(-1, 2)\n", - " minimal_epc = np.floor(proba['EPC'].value_counts().min()/2)\n", - " neighbors = max(2,minimal_epc.astype(int))\n", - " \n", - " if minimal_epc > 70:\n", - " minimal_epc = 70\n", - " neighbors = 70\n", - " \n", - " X_embedded = proba[['TIME_MS','SUM_PROXIMITY_KMS']]\n", - " nbrs = NearestNeighbors(n_neighbors=neighbors ).fit(X_embedded)\n", - " distances, indices = 
nbrs.kneighbors(X_embedded)\n", - " distance_desc = sorted(distances[:,neighbors-1], reverse=True)\n", - "\n", - " kneedle = KneeLocator(range(1,len(distance_desc)+1), #x values\n", - " distance_desc, # y values\n", - " S=1.0, #parameter suggested from paper\n", - " curve=\"convex\", #parameter from figure\n", - " direction=\"decreasing\") #parameter from figure\n", - "\n", - " eps = max(5000,kneedle.knee_y/2)\n", - "\n", - " db = DBSCAN(eps=eps, min_samples=minimal_epc, algorithm='kd_tree')\n", - " db.fit(X)\n", - " y_pred = db.fit_predict(X)\n", - " clusters = pd.DataFrame(db.labels_,columns = ['CLUSTER'],index=proba.index)\n", - " calosc = pd.concat([proba, clusters.reindex(proba.index)], axis=1)\n", - "\n", - " if db.labels_[db.labels_ == -1].size != 0 :\n", - " a = calosc[calosc.CLUSTER == -1 ]['EPC'].value_counts()\n", - " for b in a.index:\n", - " if a[b] > 0.5*proba[proba['EPC'] == b].count()[0] :\n", - " outliery = outliery.append({'MEASUREMENT': i, 'EAN':proba[proba['EPC'] == b].EAN.iloc[0], \n", - " 'StyleColor':j}, ignore_index = True)\n", - "\n", - "\n", - " return(outliery)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", - " <th>StyleColor</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>28</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " MEASUREMENT EAN 
StyleColor\n", - "0 28 5902805820447 RH797-81X" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c5(train)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>MEASUREMENT</th>\n", - " <th>EAN</th>\n", - " <th>StyleColor</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>29</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <td>42</td>\n", + " <td>303568480C5343C000B599C8</td>\n", + " <td>SO133-09M</td>\n", " </tr>\n", " <tr>\n", - " <th>1</th>\n", - " <td>32</td>\n", - " <td>5902805820447</td>\n", - " <td>RH797-81X</td>\n", + " <th>22</th>\n", + " <td>35</td>\n", + " <td>303568475450218000B59781</td>\n", + " <td>RH797-00X</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " MEASUREMENT EAN StyleColor\n", - "0 29 5902805820447 RH797-81X\n", - "1 32 5902805820447 RH797-81X" + " MEASUREMENT EPC StyleColor\n", + "0 42 3035684754340E0000B594E8 RH267-85J\n", + "1 32 303568480C287AC000B5BAD5 SL171-99X\n", + "2 35 303568480C287AC000B5BAD5 SL171-99X\n", + "3 38 303568480C287AC000B5BAD5 SL171-99X\n", + "4 38 303568480C287A8000B5BA63 SL171-99X\n", + "5 38 303568480C287A8000B5BADA SL171-99X\n", + "6 38 303568480C287A8000B5BABD SL171-99X\n", + "7 41 303568480C287AC000B5BAD5 SL171-99X\n", + "8 42 303568480C287AC000B5BAD5 SL171-99X\n", + "9 42 303568480C287A8000B5BABD SL171-99X\n", + "10 38 303568480C2B868000B599B2 RS483-99X\n", + 
"11 38 303568480C34548000B5B2B5 RV167-87X\n", + "12 42 303568458835010000B5BA58 QY337-00X\n", + "13 42 303568458835008000B5BAD1 QY337-00X\n", + "14 32 30356847541DA80000B5BA54 RJ369-87X\n", + "15 29 3035684754501F0000B5B614 RH797-81X\n", + "16 32 3035684754501F0000B5B614 RH797-81X\n", + "17 29 303568475415744000B599FE RN633-00X\n", + "18 39 303568480C5343C000B599F6 SO133-09M\n", + "19 42 303568480C5343C000B599F6 SO133-09M\n", + "20 42 303568480C53434000B599E1 SO133-09M\n", + "21 42 303568480C5343C000B599C8 SO133-09M\n", + "22 35 303568475450218000B59781 RH797-00X" ] }, - "execution_count": 65, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c5(test)" + "c6(test)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {