{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:06.718909Z", "start_time": "2024-07-30T12:35:06.081202Z" } }, "source": [ "import pandas as pd\n", "import numpy as np" ], "outputs": [], "execution_count": 1 }, { "cell_type": "code", "id": "13ad721e84c34936", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:08.731721Z", "start_time": "2024-07-30T12:35:06.720903Z" } }, "source": [ "df=pd.read_csv(r'WMT_Grocery_202209.csv')\n", "df" ], "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_13136\\1153799610.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df=pd.read_csv(r'D:\\pynb\\Walmart\\KNN\\archive\\WMT_Grocery_202209.csv')\n" ] }, { "data": { "text/plain": [ " SHIPPING_LOCATION DEPARTMENT CATEGORY SUBCATEGORY \\\n", "0 79936 Deli Hummus, Dips, & Salsa NaN \n", "1 79936 Deli Hummus, Dips, & Salsa NaN \n", "2 79936 Deli Hummus, Dips, & Salsa NaN \n", "3 79936 Deli Hummus, Dips, & Salsa NaN \n", "4 79936 Deli Hummus, Dips, & Salsa NaN \n", "... ... ... ... ... \n", "568529 70072 Alcohol Wine White Wine \n", "568530 70072 Alcohol Wine White Wine \n", "568531 70072 Alcohol Wine White Wine \n", "568532 70072 Alcohol Wine White Wine \n", "568533 70072 Alcohol Wine White Wine \n", "\n", " BREADCRUMBS SKU \\\n", "0 Deli/Hummus, Dips, & Salsa 110895339 \n", "1 Deli/Hummus, Dips, & Salsa 105455228 \n", "2 Deli/Hummus, Dips, & Salsa 128642379 \n", "3 Deli/Hummus, Dips, & Salsa 366126367 \n", "4 Deli/Hummus, Dips, & Salsa 160090316 \n", "... ... ... \n", "568529 Alcohol/Wine 593600139 \n", "568530 Alcohol/Wine 333403243 \n", "568531 Alcohol/Wine 526588325 \n", "568532 Alcohol/Wine 286992782 \n", "568533 Alcohol/Wine 160015930 \n", "\n", " PRODUCT_URL \\\n", "0 https://www.walmart.com/ip/Marketside-Roasted-... \n", "1 https://www.walmart.com/ip/Marketside-Roasted-... \n", "2 https://www.walmart.com/ip/Marketside-Classic-... \n", "3 https://www.walmart.com/ip/Marketside-Everythi... \n", "4 https://www.walmart.com/ip/Price-s-Jalapeno-Di... \n", "... ... \n", "568529 https://www.walmart.com/ip/Farm-Fresh-Blueberr... \n", "568530 https://www.walmart.com/ip/Farm-Fresh-Peach-Mo... \n", "568531 https://www.walmart.com/ip/Farm-Fresh-Raspberr... \n", "568532 https://www.walmart.com/ip/Farm-Fresh-Mango-Mo... \n", "568533 https://www.walmart.com/ip/Ole-Orleans-Heritag... \n", "\n", " PRODUCT_NAME BRAND \\\n", "0 Marketside Roasted Red Pepper Hummus, 10 Oz Marketside \n", "1 Marketside Roasted Garlic Hummus, 10 Oz Marketside \n", "2 Marketside Classic Hummus, 10 Oz Marketside \n", "3 Marketside Everything Hummus, 10 oz Marketside \n", "4 Price's Jalapeno Dip, 12 Oz. Price's \n", "... ... ... \n", "568529 Farm Fresh Blueberry Moscato 750ml Farm Fresh Wine Company \n", "568530 Farm Fresh Peach Moscato 750 Ml Farm Fresh Wine Company \n", "568531 Farm Fresh Raspberry Moscato 750ml Farm Fresh Wine Company \n", "568532 Farm Fresh Mango Moscato 750ml Farm Fresh Wine Company \n", "568533 Ole Orleans Heritage Riesling 750ml Ole Orleans \n", "\n", " PRICE_RETAIL PRICE_CURRENT PRODUCT_SIZE PROMOTION \\\n", "0 2.67 2.67 10 NaN \n", "1 2.67 2.67 10 NaN \n", "2 2.67 2.67 10 NaN \n", "3 2.67 2.67 10 NaN \n", "4 3.12 3.12 12 NaN \n", "... ... ... ... ... \n", "568529 9.98 9.98 750 NaN \n", "568530 9.98 9.98 750 NaN \n", "568531 9.98 9.98 750 NaN \n", "568532 9.98 9.98 750 NaN \n", "568533 18.98 18.98 750 NaN \n", "\n", " RunDate tid \n", "0 2022-09-11 21:20:04 16163804 \n", "1 2022-09-11 21:20:04 16163805 \n", "2 2022-09-11 21:20:04 16163806 \n", "3 2022-09-11 21:20:04 16163807 \n", "4 2022-09-11 21:20:04 16163808 \n", "... ... ... \n", "568529 2022-09-11 21:20:04 16732333 \n", "568530 2022-09-11 21:20:04 16732334 \n", "568531 2022-09-11 21:20:04 16732335 \n", "568532 2022-09-11 21:20:04 16732336 \n", "568533 2022-09-11 21:20:04 16732337 \n", "\n", "[568534 rows x 15 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SHIPPING_LOCATIONDEPARTMENTCATEGORYSUBCATEGORYBREADCRUMBSSKUPRODUCT_URLPRODUCT_NAMEBRANDPRICE_RETAILPRICE_CURRENTPRODUCT_SIZEPROMOTIONRunDatetid
079936DeliHummus, Dips, & SalsaNaNDeli/Hummus, Dips, & Salsa110895339https://www.walmart.com/ip/Marketside-Roasted-...Marketside Roasted Red Pepper Hummus, 10 OzMarketside2.672.6710NaN2022-09-11 21:20:0416163804
179936DeliHummus, Dips, & SalsaNaNDeli/Hummus, Dips, & Salsa105455228https://www.walmart.com/ip/Marketside-Roasted-...Marketside Roasted Garlic Hummus, 10 OzMarketside2.672.6710NaN2022-09-11 21:20:0416163805
279936DeliHummus, Dips, & SalsaNaNDeli/Hummus, Dips, & Salsa128642379https://www.walmart.com/ip/Marketside-Classic-...Marketside Classic Hummus, 10 OzMarketside2.672.6710NaN2022-09-11 21:20:0416163806
379936DeliHummus, Dips, & SalsaNaNDeli/Hummus, Dips, & Salsa366126367https://www.walmart.com/ip/Marketside-Everythi...Marketside Everything Hummus, 10 ozMarketside2.672.6710NaN2022-09-11 21:20:0416163807
479936DeliHummus, Dips, & SalsaNaNDeli/Hummus, Dips, & Salsa160090316https://www.walmart.com/ip/Price-s-Jalapeno-Di...Price's Jalapeno Dip, 12 Oz.Price's3.123.1212NaN2022-09-11 21:20:0416163808
................................................
56852970072AlcoholWineWhite WineAlcohol/Wine593600139https://www.walmart.com/ip/Farm-Fresh-Blueberr...Farm Fresh Blueberry Moscato 750mlFarm Fresh Wine Company9.989.98750NaN2022-09-11 21:20:0416732333
56853070072AlcoholWineWhite WineAlcohol/Wine333403243https://www.walmart.com/ip/Farm-Fresh-Peach-Mo...Farm Fresh Peach Moscato 750 MlFarm Fresh Wine Company9.989.98750NaN2022-09-11 21:20:0416732334
56853170072AlcoholWineWhite WineAlcohol/Wine526588325https://www.walmart.com/ip/Farm-Fresh-Raspberr...Farm Fresh Raspberry Moscato 750mlFarm Fresh Wine Company9.989.98750NaN2022-09-11 21:20:0416732335
56853270072AlcoholWineWhite WineAlcohol/Wine286992782https://www.walmart.com/ip/Farm-Fresh-Mango-Mo...Farm Fresh Mango Moscato 750mlFarm Fresh Wine Company9.989.98750NaN2022-09-11 21:20:0416732336
56853370072AlcoholWineWhite WineAlcohol/Wine160015930https://www.walmart.com/ip/Ole-Orleans-Heritag...Ole Orleans Heritage Riesling 750mlOle Orleans18.9818.98750NaN2022-09-11 21:20:0416732337
\n", "

568534 rows × 15 columns

\n", "
" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 2 }, { "cell_type": "code", "id": "e65e1fbd9770b4", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:08.778596Z", "start_time": "2024-07-30T12:35:08.732718Z" } }, "source": [ "df=df[['PRODUCT_NAME','DEPARTMENT','CATEGORY','BREADCRUMBS','BRAND']]\n", "df['PRODUCT']=df['PRODUCT_NAME']" ], "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_13136\\2027505516.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['PRODUCT']=df['PRODUCT_NAME']\n" ] } ], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:08.903031Z", "start_time": "2024-07-30T12:35:08.780590Z" } }, "cell_type": "code", "source": "df.isnull().sum()", "id": "fa1760637e52808f", "outputs": [ { "data": { "text/plain": [ "PRODUCT_NAME 0\n", "DEPARTMENT 0\n", "CATEGORY 0\n", "BREADCRUMBS 0\n", "BRAND 27\n", "PRODUCT 0\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 4 }, { "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:09.045434Z", "start_time": "2024-07-30T12:35:08.905158Z" } }, "cell_type": "code", "source": "df = df[df['BRAND'].apply(lambda x: isinstance(x, str))]", "id": "e33c2a31c09617ba", "outputs": [], "execution_count": 5 }, { "cell_type": "code", "id": "dc247f93acd769a", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:09.406463Z", "start_time": "2024-07-30T12:35:09.046431Z" } }, "source": [ "df.dropna()\n", "def is_string(value):\n", " return isinstance(value, str)\n", "\n", "# Identify rows in 'BRAND' column where the value is not a string\n", "non_string_rows = df[~df['BRAND'].apply(is_string)].index\n", "print(non_string_rows)\n", "# Drop those rows from the DataFrame\n", "df.drop(index=non_string_rows)" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index([], dtype='int64')\n" ] }, { "data": { "text/plain": [ " PRODUCT_NAME DEPARTMENT \\\n", "0 Marketside Roasted Red Pepper Hummus, 10 Oz Deli \n", "1 Marketside Roasted Garlic Hummus, 10 Oz Deli \n", "2 Marketside Classic Hummus, 10 Oz Deli \n", "3 Marketside Everything Hummus, 10 oz Deli \n", "4 Price's Jalapeno Dip, 12 Oz. Deli \n", "... ... ... \n", "568529 Farm Fresh Blueberry Moscato 750ml Alcohol \n", "568530 Farm Fresh Peach Moscato 750 Ml Alcohol \n", "568531 Farm Fresh Raspberry Moscato 750ml Alcohol \n", "568532 Farm Fresh Mango Moscato 750ml Alcohol \n", "568533 Ole Orleans Heritage Riesling 750ml Alcohol \n", "\n", " CATEGORY BREADCRUMBS \\\n", "0 Hummus, Dips, & Salsa Deli/Hummus, Dips, & Salsa \n", "1 Hummus, Dips, & Salsa Deli/Hummus, Dips, & Salsa \n", "2 Hummus, Dips, & Salsa Deli/Hummus, Dips, & Salsa \n", "3 Hummus, Dips, & Salsa Deli/Hummus, Dips, & Salsa \n", "4 Hummus, Dips, & Salsa Deli/Hummus, Dips, & Salsa \n", "... ... ... \n", "568529 Wine Alcohol/Wine \n", "568530 Wine Alcohol/Wine \n", "568531 Wine Alcohol/Wine \n", "568532 Wine Alcohol/Wine \n", "568533 Wine Alcohol/Wine \n", "\n", " BRAND PRODUCT \n", "0 Marketside Marketside Roasted Red Pepper Hummus, 10 Oz \n", "1 Marketside Marketside Roasted Garlic Hummus, 10 Oz \n", "2 Marketside Marketside Classic Hummus, 10 Oz \n", "3 Marketside Marketside Everything Hummus, 10 oz \n", "4 Price's Price's Jalapeno Dip, 12 Oz. \n", "... ... ... \n", "568529 Farm Fresh Wine Company Farm Fresh Blueberry Moscato 750ml \n", "568530 Farm Fresh Wine Company Farm Fresh Peach Moscato 750 Ml \n", "568531 Farm Fresh Wine Company Farm Fresh Raspberry Moscato 750ml \n", "568532 Farm Fresh Wine Company Farm Fresh Mango Moscato 750ml \n", "568533 Ole Orleans Ole Orleans Heritage Riesling 750ml \n", "\n", "[568507 rows x 6 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PRODUCT_NAMEDEPARTMENTCATEGORYBREADCRUMBSBRANDPRODUCT
0Marketside Roasted Red Pepper Hummus, 10 OzDeliHummus, Dips, & SalsaDeli/Hummus, Dips, & SalsaMarketsideMarketside Roasted Red Pepper Hummus, 10 Oz
1Marketside Roasted Garlic Hummus, 10 OzDeliHummus, Dips, & SalsaDeli/Hummus, Dips, & SalsaMarketsideMarketside Roasted Garlic Hummus, 10 Oz
2Marketside Classic Hummus, 10 OzDeliHummus, Dips, & SalsaDeli/Hummus, Dips, & SalsaMarketsideMarketside Classic Hummus, 10 Oz
3Marketside Everything Hummus, 10 ozDeliHummus, Dips, & SalsaDeli/Hummus, Dips, & SalsaMarketsideMarketside Everything Hummus, 10 oz
4Price's Jalapeno Dip, 12 Oz.DeliHummus, Dips, & SalsaDeli/Hummus, Dips, & SalsaPrice'sPrice's Jalapeno Dip, 12 Oz.
.....................
568529Farm Fresh Blueberry Moscato 750mlAlcoholWineAlcohol/WineFarm Fresh Wine CompanyFarm Fresh Blueberry Moscato 750ml
568530Farm Fresh Peach Moscato 750 MlAlcoholWineAlcohol/WineFarm Fresh Wine CompanyFarm Fresh Peach Moscato 750 Ml
568531Farm Fresh Raspberry Moscato 750mlAlcoholWineAlcohol/WineFarm Fresh Wine CompanyFarm Fresh Raspberry Moscato 750ml
568532Farm Fresh Mango Moscato 750mlAlcoholWineAlcohol/WineFarm Fresh Wine CompanyFarm Fresh Mango Moscato 750ml
568533Ole Orleans Heritage Riesling 750mlAlcoholWineAlcohol/WineOle OrleansOle Orleans Heritage Riesling 750ml
\n", "

568507 rows × 6 columns

\n", "
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 6 }, { "cell_type": "code", "id": "3478dbf45d0de013", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:09.421458Z", "start_time": "2024-07-30T12:35:09.407461Z" } }, "source": [ "import ast,re\n", "def preprocess_text(text):\n", " # Remove non-alphabet characters and extra spaces\n", " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n", " text = re.sub(r'\\s+', ' ', text).strip()\n", " return text.lower()" ], "outputs": [], "execution_count": 7 }, { "cell_type": "code", "id": "47b4b465b97821bb", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:17.915264Z", "start_time": "2024-07-30T12:35:09.422456Z" } }, "source": [ "df['PRODUCT']=df['PRODUCT'].apply(preprocess_text)\n", "df['DEPARTMENT']=df['DEPARTMENT'].apply(preprocess_text)\n", "df['CATEGORY']=df['CATEGORY'].apply(preprocess_text)\n", "df['BREADCRUMBS']=df['BREADCRUMBS'].apply(preprocess_text)\n", "df['BRAND']=df['BRAND'].apply(preprocess_text)" ], "outputs": [], "execution_count": 8 }, { "cell_type": "code", "id": "bca9973bcd761828", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:21.683604Z", "start_time": "2024-07-30T12:35:17.918253Z" } }, "source": [ "df['PRODUCT']=df['PRODUCT'].apply(lambda x:x.split())\n", "df['DEPARTMENT']=df['DEPARTMENT'].apply(lambda x:x.split())\n", "df['CATEGORY']=df['CATEGORY'].apply(lambda x:x.split())\n", "df['BREADCRUMBS']=df['BREADCRUMBS'].apply(lambda x:x.split())\n", "df['BRAND']=df['BRAND'].apply(lambda x:x.split())" ], "outputs": [], "execution_count": 9 }, { "cell_type": "code", "id": "8240161bbf8dd746", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:24.181188Z", "start_time": "2024-07-30T12:35:21.686396Z" } }, "source": "df['tags']=df['PRODUCT']+df['DEPARTMENT']+df['CATEGORY']+df['BREADCRUMBS']+df['BRAND']", "outputs": [], "execution_count": 10 }, { "cell_type": "code", "id": "620840a23eee5d5e", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:24.212213Z", "start_time": "2024-07-30T12:35:24.182142Z" } }, "source": [ "df" ], "outputs": [ { "data": { "text/plain": [ " PRODUCT_NAME DEPARTMENT \\\n", "0 Marketside Roasted Red Pepper Hummus, 10 Oz [deli] \n", "1 Marketside Roasted Garlic Hummus, 10 Oz [deli] \n", "2 Marketside Classic Hummus, 10 Oz [deli] \n", "3 Marketside Everything Hummus, 10 oz [deli] \n", "4 Price's Jalapeno Dip, 12 Oz. [deli] \n", "... ... ... \n", "568529 Farm Fresh Blueberry Moscato 750ml [alcohol] \n", "568530 Farm Fresh Peach Moscato 750 Ml [alcohol] \n", "568531 Farm Fresh Raspberry Moscato 750ml [alcohol] \n", "568532 Farm Fresh Mango Moscato 750ml [alcohol] \n", "568533 Ole Orleans Heritage Riesling 750ml [alcohol] \n", "\n", " CATEGORY BREADCRUMBS \\\n", "0 [hummus, dips, salsa] [delihummus, dips, salsa] \n", "1 [hummus, dips, salsa] [delihummus, dips, salsa] \n", "2 [hummus, dips, salsa] [delihummus, dips, salsa] \n", "3 [hummus, dips, salsa] [delihummus, dips, salsa] \n", "4 [hummus, dips, salsa] [delihummus, dips, salsa] \n", "... ... ... \n", "568529 [wine] [alcoholwine] \n", "568530 [wine] [alcoholwine] \n", "568531 [wine] [alcoholwine] \n", "568532 [wine] [alcoholwine] \n", "568533 [wine] [alcoholwine] \n", "\n", " BRAND \\\n", "0 [marketside] \n", "1 [marketside] \n", "2 [marketside] \n", "3 [marketside] \n", "4 [prices] \n", "... ... \n", "568529 [farm, fresh, wine, company] \n", "568530 [farm, fresh, wine, company] \n", "568531 [farm, fresh, wine, company] \n", "568532 [farm, fresh, wine, company] \n", "568533 [ole, orleans] \n", "\n", " PRODUCT \\\n", "0 [marketside, roasted, red, pepper, hummus, oz] \n", "1 [marketside, roasted, garlic, hummus, oz] \n", "2 [marketside, classic, hummus, oz] \n", "3 [marketside, everything, hummus, oz] \n", "4 [prices, jalapeno, dip, oz] \n", "... ... \n", "568529 [farm, fresh, blueberry, moscato, ml] \n", "568530 [farm, fresh, peach, moscato, ml] \n", "568531 [farm, fresh, raspberry, moscato, ml] \n", "568532 [farm, fresh, mango, moscato, ml] \n", "568533 [ole, orleans, heritage, riesling, ml] \n", "\n", " tags \n", "0 [marketside, roasted, red, pepper, hummus, oz,... \n", "1 [marketside, roasted, garlic, hummus, oz, deli... \n", "2 [marketside, classic, hummus, oz, deli, hummus... \n", "3 [marketside, everything, hummus, oz, deli, hum... \n", "4 [prices, jalapeno, dip, oz, deli, hummus, dips... \n", "... ... \n", "568529 [farm, fresh, blueberry, moscato, ml, alcohol,... \n", "568530 [farm, fresh, peach, moscato, ml, alcohol, win... \n", "568531 [farm, fresh, raspberry, moscato, ml, alcohol,... \n", "568532 [farm, fresh, mango, moscato, ml, alcohol, win... \n", "568533 [ole, orleans, heritage, riesling, ml, alcohol... \n", "\n", "[568507 rows x 7 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PRODUCT_NAMEDEPARTMENTCATEGORYBREADCRUMBSBRANDPRODUCTtags
0Marketside Roasted Red Pepper Hummus, 10 Oz[deli][hummus, dips, salsa][delihummus, dips, salsa][marketside][marketside, roasted, red, pepper, hummus, oz][marketside, roasted, red, pepper, hummus, oz,...
1Marketside Roasted Garlic Hummus, 10 Oz[deli][hummus, dips, salsa][delihummus, dips, salsa][marketside][marketside, roasted, garlic, hummus, oz][marketside, roasted, garlic, hummus, oz, deli...
2Marketside Classic Hummus, 10 Oz[deli][hummus, dips, salsa][delihummus, dips, salsa][marketside][marketside, classic, hummus, oz][marketside, classic, hummus, oz, deli, hummus...
3Marketside Everything Hummus, 10 oz[deli][hummus, dips, salsa][delihummus, dips, salsa][marketside][marketside, everything, hummus, oz][marketside, everything, hummus, oz, deli, hum...
4Price's Jalapeno Dip, 12 Oz.[deli][hummus, dips, salsa][delihummus, dips, salsa][prices][prices, jalapeno, dip, oz][prices, jalapeno, dip, oz, deli, hummus, dips...
........................
568529Farm Fresh Blueberry Moscato 750ml[alcohol][wine][alcoholwine][farm, fresh, wine, company][farm, fresh, blueberry, moscato, ml][farm, fresh, blueberry, moscato, ml, alcohol,...
568530Farm Fresh Peach Moscato 750 Ml[alcohol][wine][alcoholwine][farm, fresh, wine, company][farm, fresh, peach, moscato, ml][farm, fresh, peach, moscato, ml, alcohol, win...
568531Farm Fresh Raspberry Moscato 750ml[alcohol][wine][alcoholwine][farm, fresh, wine, company][farm, fresh, raspberry, moscato, ml][farm, fresh, raspberry, moscato, ml, alcohol,...
568532Farm Fresh Mango Moscato 750ml[alcohol][wine][alcoholwine][farm, fresh, wine, company][farm, fresh, mango, moscato, ml][farm, fresh, mango, moscato, ml, alcohol, win...
568533Ole Orleans Heritage Riesling 750ml[alcohol][wine][alcoholwine][ole, orleans][ole, orleans, heritage, riesling, ml][ole, orleans, heritage, riesling, ml, alcohol...
\n", "

568507 rows × 7 columns

\n", "
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 11 }, { "cell_type": "code", "id": "3a28e095285cd9a", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:24.243648Z", "start_time": "2024-07-30T12:35:24.213211Z" } }, "source": "new_df=df[['PRODUCT_NAME',\"tags\"]]", "outputs": [], "execution_count": 12 }, { "cell_type": "code", "id": "c672ae1826940651", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:24.258999Z", "start_time": "2024-07-30T12:35:24.245254Z" } }, "source": [ "new_df" ], "outputs": [ { "data": { "text/plain": [ " PRODUCT_NAME \\\n", "0 Marketside Roasted Red Pepper Hummus, 10 Oz \n", "1 Marketside Roasted Garlic Hummus, 10 Oz \n", "2 Marketside Classic Hummus, 10 Oz \n", "3 Marketside Everything Hummus, 10 oz \n", "4 Price's Jalapeno Dip, 12 Oz. \n", "... ... \n", "568529 Farm Fresh Blueberry Moscato 750ml \n", "568530 Farm Fresh Peach Moscato 750 Ml \n", "568531 Farm Fresh Raspberry Moscato 750ml \n", "568532 Farm Fresh Mango Moscato 750ml \n", "568533 Ole Orleans Heritage Riesling 750ml \n", "\n", " tags \n", "0 [marketside, roasted, red, pepper, hummus, oz,... \n", "1 [marketside, roasted, garlic, hummus, oz, deli... \n", "2 [marketside, classic, hummus, oz, deli, hummus... \n", "3 [marketside, everything, hummus, oz, deli, hum... \n", "4 [prices, jalapeno, dip, oz, deli, hummus, dips... \n", "... ... \n", "568529 [farm, fresh, blueberry, moscato, ml, alcohol,... \n", "568530 [farm, fresh, peach, moscato, ml, alcohol, win... \n", "568531 [farm, fresh, raspberry, moscato, ml, alcohol,... \n", "568532 [farm, fresh, mango, moscato, ml, alcohol, win... \n", "568533 [ole, orleans, heritage, riesling, ml, alcohol... \n", "\n", "[568507 rows x 2 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PRODUCT_NAMEtags
0Marketside Roasted Red Pepper Hummus, 10 Oz[marketside, roasted, red, pepper, hummus, oz,...
1Marketside Roasted Garlic Hummus, 10 Oz[marketside, roasted, garlic, hummus, oz, deli...
2Marketside Classic Hummus, 10 Oz[marketside, classic, hummus, oz, deli, hummus...
3Marketside Everything Hummus, 10 oz[marketside, everything, hummus, oz, deli, hum...
4Price's Jalapeno Dip, 12 Oz.[prices, jalapeno, dip, oz, deli, hummus, dips...
.........
568529Farm Fresh Blueberry Moscato 750ml[farm, fresh, blueberry, moscato, ml, alcohol,...
568530Farm Fresh Peach Moscato 750 Ml[farm, fresh, peach, moscato, ml, alcohol, win...
568531Farm Fresh Raspberry Moscato 750ml[farm, fresh, raspberry, moscato, ml, alcohol,...
568532Farm Fresh Mango Moscato 750ml[farm, fresh, mango, moscato, ml, alcohol, win...
568533Ole Orleans Heritage Riesling 750ml[ole, orleans, heritage, riesling, ml, alcohol...
\n", "

568507 rows × 2 columns

\n", "
" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 13 }, { "cell_type": "code", "id": "9f206d66e3a02e2d", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:26.205605Z", "start_time": "2024-07-30T12:35:24.260533Z" } }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "from nltk.stem.porter import PorterStemmer\n", "ps=PorterStemmer()\n", "cv=CountVectorizer(max_features=5000,stop_words='english')" ], "outputs": [], "execution_count": 14 }, { "cell_type": "code", "id": "179547695cf71375", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:35:26.221713Z", "start_time": "2024-07-30T12:35:26.206601Z" } }, "source": [ "def stem(text):\n", " y=[]\n", " for i in text:\n", " y.append(ps.stem(i))\n", " return \" \".join(y)" ], "outputs": [], "execution_count": 15 }, { "cell_type": "code", "id": "40e19aacfa32d7f9", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:37:14.151419Z", "start_time": "2024-07-30T12:35:26.222722Z" } }, "source": [ "new_df['tags']=new_df['tags'].apply(stem)\n" ], "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_13136\\1459480162.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " new_df['tags']=new_df['tags'].apply(stem)\n" ] } ], "execution_count": 16 }, { "cell_type": "code", "id": "24975d8282c44c17", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:37:21.969928Z", "start_time": "2024-07-30T12:37:14.153418Z" } }, "source": [ "vectors=cv.fit_transform(new_df['tags']).toarray()" ], "outputs": [ { "ename": "MemoryError", "evalue": "Unable to allocate 21.2 GiB for an array with shape (568507, 5000) and data type int64", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mMemoryError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[17], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m vectors\u001B[38;5;241m=\u001B[39m\u001B[43mcv\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfit_transform\u001B[49m\u001B[43m(\u001B[49m\u001B[43mnew_df\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtags\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtoarray\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32mD:\\pynb\\Walmart\\venv\\lib\\site-packages\\scipy\\sparse\\_compressed.py:1181\u001B[0m, in \u001B[0;36m_cs_matrix.toarray\u001B[1;34m(self, order, out)\u001B[0m\n\u001B[0;32m 1179\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m out \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m order \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m 1180\u001B[0m order \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_swap(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcf\u001B[39m\u001B[38;5;124m'\u001B[39m)[\u001B[38;5;241m0\u001B[39m]\n\u001B[1;32m-> 1181\u001B[0m out \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_process_toarray_args\u001B[49m\u001B[43m(\u001B[49m\u001B[43morder\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mout\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1182\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (out\u001B[38;5;241m.\u001B[39mflags\u001B[38;5;241m.\u001B[39mc_contiguous \u001B[38;5;129;01mor\u001B[39;00m out\u001B[38;5;241m.\u001B[39mflags\u001B[38;5;241m.\u001B[39mf_contiguous):\n\u001B[0;32m 1183\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mOutput array must be C or F contiguous\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", "File \u001B[1;32mD:\\pynb\\Walmart\\venv\\lib\\site-packages\\scipy\\sparse\\_base.py:1301\u001B[0m, in \u001B[0;36m_spbase._process_toarray_args\u001B[1;34m(self, order, out)\u001B[0m\n\u001B[0;32m 1299\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m out\n\u001B[0;32m 1300\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m-> 1301\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mzeros\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mshape\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdtype\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdtype\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43morder\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43morder\u001B[49m\u001B[43m)\u001B[49m\n", "\u001B[1;31mMemoryError\u001B[0m: Unable to allocate 21.2 GiB for an array with shape (568507, 5000) and data type int64" ] } ], "execution_count": 17 }, { "cell_type": "code", "id": "84d50839b49ad1ce", "metadata": { "ExecuteTime": { "end_time": "2024-07-30T12:37:21.971885Z", "start_time": "2024-07-30T12:37:21.971885Z" } }, "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "similarity=cosine_similarity(vectors)\n" ], "outputs": [], "execution_count": null }, { "cell_type": "code", "id": "9f8b1afa332ca4b7", "metadata": {}, "source": [ "def recommend(item):\n", " item_index=new_df[new_df['PRODUCT_NAME']==item].index[0]\n", " distance=similarity[item_index]\n", " items_list=sorted(list(enumerate(distance)),reverse=True,key=lambda x:x[1])[1:6]\n", " \n", " for i in items_list:\n", " print(new_df.iloc[i[0]]['PRODUCT_NAME'])\n", "\n", "def get_recommendations(user_description, count_vectorizer, count_matrix):\n", " # Preprocess the user-provided description\n", " user_description = preprocess_text(user_description)\n", " \n", " # Transform the user description into the same feature space\n", " user_vector = count_vectorizer.transform([user_description])\n", " \n", " # Compute cosine similarity between user description and item descriptions\n", " cosine_similarities = cosine_similarity(user_vector, count_matrix).flatten()\n", " \n", " # Get indices of the most similar items\n", " similar_indices = cosine_similarities.argsort()[::-1]\n", " \n", " return similar_indices\n" ], "outputs": [], "execution_count": null }, { "cell_type": "code", "id": "72c21ab855a6ba41", "metadata": {}, "source": [ "recommend(\"THE FIRST YEARS\")" ], "outputs": [], "execution_count": null }, { "cell_type": "code", "id": "206cf57c6ce8bbf9", "metadata": {}, "source": [ "new_df.iloc[get_recommendations('milk', cv, vectors)]" ], "outputs": [], "execution_count": null }, { "cell_type": "code", "id": "76143eea-ffb5-4d4c-a700-b0d98de0bb01", "metadata": {}, "source": [ "import pickle\n", "with open(\"cv.pkl\",\"wb\") as file:\n", " pickle.dump(cv,file)\n", "with open(\"vectors.pkl\",\"wb\")as file:\n", " pickle.dump(vectors,file)" ], "outputs": [], "execution_count": null }, { "cell_type": "code", "id": "c5ee2911-a4bc-4a39-bde9-a8ee8378cc88", "metadata": {}, "source": [], "outputs": [], "execution_count": null } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.19" } }, "nbformat": 4, "nbformat_minor": 5 }