Association Rule Generation using Apriori
In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.polynomial.polynomial import polyfit
import matplotlib.style as style
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')
# define random jitter
def rjitt(arr):
stdev = .01*(max(arr)-min(arr))
return arr + np.random.randn(len(arr)) * stdev
# YONG's local data path
dataPath = 'C:/YONG/m1710/myPydata/'
Section 1. Load dataset
In [6]:
# Import Dataset
# dataset = pd.read_csv(‘apriori_data2.csv’, header = None)
tr01a = pd.read_csv(dataPath + 'onlneRetail_A.csv',
dtype = {"StockCode" : "str"})
tr01b = pd.read_csv(dataPath + 'onlneRetail_B.csv')
# to bypass github upload size limitation, read pre-split datasets and merge
tr01c = pd.concat([tr01a, tr01b], axis=0)
print(tr01c.shape)
tr01 = tr01c[['CustomerID','StockCode']]
print(tr01.shape)
tr01c.head()
(541909, 8)
(541909, 2)
Out[6]:
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry01234
536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 8:26 | 2.55 | 17850.0 | United Kingdom |
536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 8:26 | 3.39 | 17850.0 | United Kingdom |
536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 8:26 | 2.75 | 17850.0 | United Kingdom |
536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 8:26 | 3.39 | 17850.0 | United Kingdom |
536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 8:26 | 3.39 | 17850.0 | United Kingdom |
In [7]:
# print(tr01.shape)
# 값이 없는 레코드의 수 확인 -- 분석 제외 대상
print('count_nan', len(tr01) - tr01.count())
tr01 = tr01.dropna().drop_duplicates()
print(tr01.shape)
tr01['CustomerID'] = tr01.CustomerID.astype(int).astype(str)
count_nan CustomerID 135080
StockCode 0
dtype: int64
(267615, 2)
In [8]:
tr01[:10]
Out[8]:
CustomerIDStockCode0123456789
17850 | 85123A |
17850 | 71053 |
17850 | 84406B |
17850 | 84029G |
17850 | 84029E |
17850 | 22752 |
17850 | 21730 |
17850 | 22633 |
17850 | 22632 |
13047 | 84879 |
In [9]:
# Check Frequency Distribution
nCust = len(tr01.CustomerID.unique())
print(nCust)
plt.figure(figsize = (20,4))
plt.bar(tr01.StockCode.value_counts()[:200].index,
tr01.StockCode.value_counts()[:200]*100/nCust)
plt.ylim(0,tr01.StockCode.value_counts()[:200].max()*100/nCust)
plt.axhline(tr01.StockCode.value_counts()[:200].min()*100/nCust,
linestyle=':', color='grey')
plt.ylabel('PCNT Customer Ordered')
plt.title('Percentage of Customer Ordered by Item')
plt.show()
4372

In [97]:
# Check Frequency Distribution
nStockCode = len(tr01.StockCode.unique())
print(nStockCode)
plt.figure(figsize = (20,4))
WideCusts = tr01.CustomerID.value_counts()[:200]
plt.bar(WideCusts.index,
WideCusts*100/nStockCode)
plt.ylim(0,WideCusts.max()*100/nStockCode)
plt.axhline(WideCusts.min()*100/nStockCode,
linestyle=':', color='grey')
plt.ylabel('PCNT StockCode Ordered')
plt.title('Percentage of Item Ordered by Customer')
plt.show()
3684

In [10]:
# Association rule generation을 위해 list format으로 변환
tmparr = []
ucusts = tr01.CustomerID.unique()
numcusts = len(ucusts)
for i in range(0, numcusts-1):
tmpdf = tr01.loc[tr01['CustomerID']==ucusts[i]]
tmparr.append(list(tmpdf['StockCode']))
# Check the formatted
np.array(tmparr[1])
Out[10]:
array(['84879', '22745', '22748', '22749', '22310', '84969', '22623',
'22622', '21754', '21755', '21777', '48187', '22960', '22913',
'22912', '22914', '21756', '22274', '16161P', '22766', '22708',
'23231', '85015', '85016', '22829', '22722', '22961', '22969',
'22720', '22549', '20972', '22743', '21658', '48184', '22692',
'22499', '22607', '47566', '21902', '21901', '21899', '21900',
'22077', '85123A', '23152', '23182', '23161', '23160', '23163',
'23162', '23164', '23177', '23176', '23006', '22996', '23070',
'23126', '23299', '22423', '21621', '23092', '23112', '23110',
'23111', '23118', '23093', '23089', '23168', '23236', '23240',
'23175', '23173', '23245', '23032', '23031', '23029', '23028',
'85053', '22090', '22089', '22088', '23546', '23545', '22986',
'22739', '23435', '22621', '48188', '22899', '22367', '23514',
'23513', '23511', '23198', '23294', '23295', '23025', '23470',
'23397', '23096', '23487', '23109', '23434', '22086', '85049E',
'M'], dtype='<U6')
Section 2. Extracting association rules
In [47]:
# Train Apriori Model
from mlxtend.frequent_patterns import apriori
from apyori import apriori
rules = apriori(tmparr, min_support = 0.05, min_confidence = 0.33, min_lift = 3, max_length = 2)
# try min_support = 0.02 or min_confidence = 0.2
# to check the distribution of rules ( CAUTION :: takes a while )
# rules = apriori(tmparr, min_support = 0.01, min_confidence = 0.1, min_lift = 2, max_length = 2)
# Listing the resulting Rules
results = list(rules)
# print(results[:10])
print("number of rules : " + str(len(results)) + '\n')
num_rules = len(results)
column_names = ["hypo", "conc", "conf","supp","lift"]
ruledf = pd.DataFrame(columns = column_names)
# Print rules one by one
for i in range(0, num_rules):
result = results[i]
# print(result)
supp = int(result.support*10000)/100
conf = int(result.ordered_statistics[0].confidence *1000)/10
lift = int(result.ordered_statistics[0].lift *100)/100,
hypo = ''.join([x+' ' for x in result.ordered_statistics[0].items_base]).strip()
conc = ''.join([x+' ' for x in result.ordered_statistics[0].items_add]).strip()
print("{ "+str(hypo)+ " ==> " + str(conc) +
" } [ conf = " + str(conf) + "%,"+ " supp = "+str(supp)+"%, " +
" lift = "+str(list(lift)[0])+"]")
# insert raws into a container df
ruledf.loc[i] = [hypo, conc, conf, supp, list(lift)[0]]
number of rules : 81
{ 20725 ==> 20726 } [ conf = 47.1%, supp = 5.74%, lift = 5.52]
{ 20725 ==> 20727 } [ conf = 56.3%, supp = 6.86%, lift = 5.38]
{ 20725 ==> 20728 } [ conf = 57.8%, supp = 7.04%, lift = 5.26]
{ 20725 ==> 22382 } [ conf = 56.7%, supp = 6.9%, lift = 5.05]
{ 20725 ==> 22383 } [ conf = 53.1%, supp = 6.47%, lift = 5.34]
{ 20725 ==> 22384 } [ conf = 55.4%, supp = 6.74%, lift = 5.44]
{ 20725 ==> 23206 } [ conf = 47.3%, supp = 5.76%, lift = 5.16]
{ 20725 ==> 23207 } [ conf = 42.8%, supp = 5.21%, lift = 4.96]
{ 20725 ==> 23209 } [ conf = 50.0%, supp = 6.08%, lift = 4.65]
{ 20725 ==> 85099B } [ conf = 51.3%, supp = 6.24%, lift = 3.52]
{ 20726 ==> 20727 } [ conf = 59.2%, supp = 5.05%, lift = 5.65]
{ 20726 ==> 20728 } [ conf = 63.2%, supp = 5.39%, lift = 5.74]
{ 20726 ==> 22382 } [ conf = 70.5%, supp = 6.01%, lift = 6.27]
{ 20727 ==> 20728 } [ conf = 57.2%, supp = 5.99%, lift = 5.19]
{ 20727 ==> 22382 } [ conf = 58.2%, supp = 6.1%, lift = 5.18]
{ 20727 ==> 22383 } [ conf = 58.2%, supp = 6.1%, lift = 5.85]
{ 20727 ==> 22384 } [ conf = 60.9%, supp = 6.38%, lift = 5.98]
{ 20727 ==> 23207 } [ conf = 48.0%, supp = 5.03%, lift = 5.56]
{ 20727 ==> 23209 } [ conf = 48.6%, supp = 5.1%, lift = 4.53]
{ 20728 ==> 22382 } [ conf = 59.0%, supp = 6.49%, lift = 5.25]
{ 20728 ==> 22383 } [ conf = 59.8%, supp = 6.58%, lift = 6.01]
{ 20728 ==> 22384 } [ conf = 59.2%, supp = 6.52%, lift = 5.81]
{ 20728 ==> 23206 } [ conf = 48.8%, supp = 5.37%, lift = 5.32]
{ 20728 ==> 23207 } [ conf = 48.2%, supp = 5.3%, lift = 5.59]
{ 20728 ==> 23209 } [ conf = 49.4%, supp = 5.44%, lift = 4.61]
{ 21212 ==> 21977 } [ conf = 43.2%, supp = 6.29%, lift = 4.59]
{ 21212 ==> 84991 } [ conf = 42.4%, supp = 6.17%, lift = 4.47]
{ 21733 ==> 85123A } [ conf = 81.0%, supp = 7.13%, lift = 4.12]
{ 21754 ==> 21755 } [ conf = 61.1%, supp = 5.03%, lift = 7.92]
{ 21931 ==> 85099B } [ conf = 67.9%, supp = 5.19%, lift = 4.67]
{ 22086 ==> 22910 } [ conf = 53.1%, supp = 7.48%, lift = 4.94]
{ 22138 ==> 22617 } [ conf = 41.7%, supp = 5.58%, lift = 6.07]
{ 22382 ==> 22383 } [ conf = 57.4%, supp = 6.45%, lift = 5.77]
{ 22382 ==> 22384 } [ conf = 51.9%, supp = 5.83%, lift = 5.1]
{ 22382 ==> 22662 } [ conf = 47.4%, supp = 5.33%, lift = 6.64]
{ 22382 ==> 23206 } [ conf = 45.8%, supp = 5.14%, lift = 4.99]
{ 22382 ==> 23207 } [ conf = 46.4%, supp = 5.21%, lift = 5.38]
{ 22382 ==> 23209 } [ conf = 49.0%, supp = 5.51%, lift = 4.57]
{ 22383 ==> 22384 } [ conf = 55.8%, supp = 5.55%, lift = 5.48]
{ 22383 ==> 23206 } [ conf = 51.4%, supp = 5.12%, lift = 5.61]
{ 22384 ==> 23209 } [ conf = 51.2%, supp = 5.21%, lift = 4.77]
{ 22384 ==> 85099B } [ conf = 51.6%, supp = 5.26%, lift = 3.55]
{ 22386 ==> 85099B } [ conf = 76.6%, supp = 6.52%, lift = 5.26]
{ 22411 ==> 85099B } [ conf = 60.2%, supp = 5.17%, lift = 4.14]
{ 22423 ==> 22697 } [ conf = 33.8%, supp = 6.86%, lift = 3.79]
{ 22698 ==> 22423 } [ conf = 81.3%, supp = 5.97%, lift = 4.0]
{ 22423 ==> 22699 } [ conf = 36.5%, supp = 7.41%, lift = 3.73]
{ 22423 ==> 23245 } [ conf = 34.1%, supp = 6.93%, lift = 3.16]
{ 22469 ==> 22470 } [ conf = 53.9%, supp = 7.06%, lift = 4.84]
{ 22469 ==> 23321 } [ conf = 39.0%, supp = 5.12%, lift = 4.52]
{ 22577 ==> 22578 } [ conf = 73.2%, supp = 5.76%, lift = 9.58]
{ 22629 ==> 22630 } [ conf = 69.5%, supp = 5.12%, lift = 10.55]
{ 22666 ==> 22720 } [ conf = 56.0%, supp = 6.06%, lift = 3.82]
{ 22697 ==> 22698 } [ conf = 76.4%, supp = 6.81%, lift = 10.4]
{ 22697 ==> 22699 } [ conf = 83.0%, supp = 7.41%, lift = 8.5]
{ 22698 ==> 22699 } [ conf = 85.0%, supp = 6.24%, lift = 8.7]
{ 22720 ==> 22722 } [ conf = 42.4%, supp = 6.22%, lift = 4.72]
{ 22720 ==> 23243 } [ conf = 34.6%, supp = 5.07%, lift = 5.51]
{ 22720 ==> 23245 } [ conf = 35.1%, supp = 5.14%, lift = 3.25]
{ 22726 ==> 22727 } [ conf = 82.0%, supp = 6.06%, lift = 9.17]
{ 22727 ==> 22728 } [ conf = 58.0%, supp = 5.19%, lift = 8.69]
{ 22804 ==> 85123A } [ conf = 90.9%, supp = 5.26%, lift = 4.63]
{ 22960 ==> 22961 } [ conf = 45.2%, supp = 5.94%, lift = 3.68]
{ 23199 ==> 23203 } [ conf = 66.2%, supp = 5.21%, lift = 5.73]
{ 23199 ==> 85099B } [ conf = 72.6%, supp = 5.71%, lift = 4.99]
{ 23201 ==> 23203 } [ conf = 57.2%, supp = 5.17%, lift = 4.95]
{ 23201 ==> 85099B } [ conf = 61.2%, supp = 5.53%, lift = 4.21]
{ 23202 ==> 23203 } [ conf = 64.4%, supp = 5.55%, lift = 5.57]
{ 23202 ==> 85099B } [ conf = 60.4%, supp = 5.21%, lift = 4.15]
{ 23203 ==> 23209 } [ conf = 54.4%, supp = 6.29%, lift = 5.07]
{ 23203 ==> 85099B } [ conf = 60.9%, supp = 7.04%, lift = 4.19]
{ 23206 ==> 23207 } [ conf = 56.1%, supp = 5.14%, lift = 6.5]
{ 23206 ==> 23209 } [ conf = 60.5%, supp = 5.55%, lift = 5.64]
{ 23207 ==> 23209 } [ conf = 58.8%, supp = 5.07%, lift = 5.48]
{ 23298 ==> 47566 } [ conf = 52.4%, supp = 6.86%, lift = 3.23]
{ 23300 ==> 23301 } [ conf = 75.7%, supp = 6.08%, lift = 8.17]
{ 23321 ==> 23322 } [ conf = 58.7%, supp = 5.07%, lift = 7.33]
{ 23344 ==> 85099B } [ conf = 57.5%, supp = 5.14%, lift = 3.95]
{ 82482 ==> 82494L } [ conf = 68.8%, supp = 6.47%, lift = 7.67]
{ 85099B ==> 85099C } [ conf = 35.3%, supp = 5.14%, lift = 5.05]
{ 85099B ==> 85099F } [ conf = 38.9%, supp = 5.67%, lift = 5.37]
In [48]:
np.array(ruledf.tail().hypo)
np.array(ruledf.tail().conc)
ruledf.tail()
Out[48]:
hypoconcconfsupplift7677787980
23321 | 23322 | 58.7 | 5.07 | 7.33 |
23344 | 85099B | 57.5 | 5.14 | 3.95 |
82482 | 82494L | 68.8 | 6.47 | 7.67 |
85099B | 85099C | 35.3 | 5.14 | 5.05 |
85099B | 85099F | 38.9 | 5.67 | 5.37 |
visual profiling of the resulting rule set
In [49]:
# profiling the resulting rule set
def scattrules(X, Y, Xlab, Ylab):
plt.scatter(rjitt(X), rjitt(Y), alpha=0.3)
plt.xlabel(Xlab)
plt.ylabel(Ylab)
plt.axvline(X.median(), linestyle=':', color='grey')
plt.axhline(Y.median(), linestyle=':', color='grey')
plt.plot(np.unique(X), np.poly1d(np.polyfit(X, Y, 3))(np.unique(X)),
color='red', linestyle='--')
plt.show()
X = ruledf.conf
Y = ruledf.supp
Xlab, Ylab = 'CONF', 'SUPP'
scattrules(X, Y, Xlab, Ylab)
scattrules(ruledf.conf, ruledf.lift, 'CONF', 'LIFT')
scattrules(ruledf.supp, ruledf.lift, 'SUPP', 'LIFT')



In [13]:
# attach description of stockCode for human inspection
dfItemName = tr01a[['StockCode','Description']].drop_duplicates().dropna()
dfItemName['StockCode'] = np.array(dfItemName.StockCode.astype(str))
dfItemName['Description'] = np.array(dfItemName.Description.astype(str))
dfItemName.shape
dfItemName.columns = ['hypo','deschypo']
dfItemName1 = dfItemName.copy()
dfItemName1.columns = ['conc','descconc']
dfItemName1 = dfItemName1[dfItemName1.descconc!='?']
dfItemName.head()
# to check if description is correct
dfItemName1[dfItemName1.conc=='85123A']
Out[13]:
concdescconc0
85123A | WHITE HANGING HEART T-LIGHT HOLDER |
In [111]:
# attach descriptions using left join
ruledf1 = pd.merge(ruledf, dfItemName, how='left', on='hypo')
ruledf1.head()
ruledf1.tail(10)
ruledf2 = pd.merge(ruledf1, dfItemName1, how='left', on='conc')
ruledf2.head()
ruledf3 = ruledf2[ (ruledf2.conf>30) & (ruledf2.lift>5)].sort_values('lift', ascending=False)
print('number of rules : ', len(ruledf3))
ruledf3
number of rules : 73
Out[111]:
hypoconcconfsuppliftdeschypodescconc838582918792861204612412312154113264950363731293038515225969524114...601161171312563212611891039664535272814155311010910810776125
22629 | 22630 | 69.5 | 5.12 | 10.55 | SPACEBOY LUNCH BOX | DOLLY GIRL LUNCH BOX |
22697 | 22698 | 76.4 | 6.81 | 10.40 | GREEN REGENCY TEACUP AND SAUCER | PINK REGENCY TEACUP AND SAUCER |
22577 | 22578 | 73.2 | 5.76 | 9.58 | WOODEN HEART CHRISTMAS SCANDINAVIAN | WOODEN STAR CHRISTMAS SCANDINAVIAN |
22726 | 22727 | 82.0 | 6.06 | 9.17 | ALARM CLOCK BAKELIKE GREEN | ALARM CLOCK BAKELIKE RED |
22698 | 22699 | 85.0 | 6.24 | 8.70 | PINK REGENCY TEACUP AND SAUCER | ROSES REGENCY TEACUP AND SAUCER |
22727 | 22728 | 58.0 | 5.19 | 8.69 | ALARM CLOCK BAKELIKE RED | ALARM CLOCK BAKELIKE PINK |
22697 | 22699 | 83.0 | 7.41 | 8.50 | GREEN REGENCY TEACUP AND SAUCER | ROSES REGENCY TEACUP AND SAUCER |
23300 | 23301 | 75.7 | 6.08 | 8.17 | GARDENERS KNEELING PAD CUP OF TEA | GARDENERS KNEELING PAD KEEP CALM |
21754 | 21755 | 61.1 | 5.03 | 7.92 | HOME BUILDING BLOCK WORD | LOVE BUILDING BLOCK WORD |
82482 | 82494L | 68.8 | 6.47 | 7.67 | WOODEN PICTURE FRAME WHITE FINISH | cracked |
82482 | 82494L | 68.8 | 6.47 | 7.67 | WOODEN PICTURE FRAME WHITE FINISH | WOODEN FRAME ANTIQUE WHITE |
23321 | 23322 | 58.7 | 5.07 | 7.33 | SMALL WHITE HEART OF WICKER | LARGE WHITE HEART OF WICKER |
22382 | 22662 | 47.4 | 5.33 | 6.64 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG DOLLY GIRL DESIGN |
23206 | 23207 | 56.1 | 5.14 | 6.50 | LUNCH BAG APPLE DESIGN | LUNCH BAG ALPHABET DESIGN |
20726 | 22382 | 70.5 | 6.01 | 6.27 | LUNCH BAG WOODLAND | LUNCH BAG SPACEBOY DESIGN |
22138 | 22617 | 41.7 | 5.58 | 6.07 | BAKING SET 9 PIECE RETROSPOT | BAKING SET SPACEBOY DESIGN |
22138 | 22617 | 41.7 | 5.58 | 6.07 | BAKING SET 9 PIECE RETROSPOT | mouldy, thrown away. |
20728 | 22383 | 59.8 | 6.58 | 6.01 | LUNCH BAG CARS BLUE | LUNCH BAG SUKI DESIGN |
20728 | 22383 | 59.8 | 6.58 | 6.01 | LUNCH BAG CARS BLUE | LUNCH BAG SUKI DESIGN |
20727 | 22384 | 60.9 | 6.38 | 5.98 | LUNCH BAG BLACK SKULL. | LUNCH BAG PINK POLKADOT |
20727 | 22383 | 58.2 | 6.10 | 5.85 | LUNCH BAG BLACK SKULL. | LUNCH BAG SUKI DESIGN |
20727 | 22383 | 58.2 | 6.10 | 5.85 | LUNCH BAG BLACK SKULL. | LUNCH BAG SUKI DESIGN |
20728 | 22384 | 59.2 | 6.52 | 5.81 | LUNCH BAG CARS BLUE | LUNCH BAG PINK POLKADOT |
22382 | 22383 | 57.4 | 6.45 | 5.77 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG SUKI DESIGN |
22382 | 22383 | 57.4 | 6.45 | 5.77 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG SUKI DESIGN |
20726 | 20728 | 63.2 | 5.39 | 5.74 | LUNCH BAG WOODLAND | LUNCH BAG CARS BLUE |
23199 | 23203 | 66.2 | 5.21 | 5.73 | JUMBO BAG APPLES | JUMBO BAG DOILEY PATTERNS |
23199 | 23203 | 66.2 | 5.21 | 5.73 | JUMBO BAG APPLES | mailout |
20726 | 20727 | 59.2 | 5.05 | 5.65 | LUNCH BAG WOODLAND | LUNCH BAG BLACK SKULL. |
23206 | 23209 | 60.5 | 5.55 | 5.64 | LUNCH BAG APPLE DESIGN | mailout |
... | ... | ... | ... | ... | ... | ... |
22383 | 22384 | 55.8 | 5.55 | 5.48 | LUNCH BAG SUKI DESIGN | LUNCH BAG PINK POLKADOT |
23207 | 23209 | 58.8 | 5.07 | 5.48 | LUNCH BAG ALPHABET DESIGN | mailout |
23207 | 23209 | 58.8 | 5.07 | 5.48 | LUNCH BAG ALPHABET DESIGN | LUNCH BAG DOILEY PATTERN |
20725 | 22384 | 55.4 | 6.74 | 5.44 | LUNCH BAG RED SPOTTY | LUNCH BAG PINK POLKADOT |
20725 | 22384 | 55.4 | 6.74 | 5.44 | LUNCH BAG RED RETROSPOT | LUNCH BAG PINK POLKADOT |
22382 | 23207 | 46.4 | 5.21 | 5.38 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG ALPHABET DESIGN |
20725 | 20727 | 56.3 | 6.86 | 5.38 | LUNCH BAG RED SPOTTY | LUNCH BAG BLACK SKULL. |
20725 | 20727 | 56.3 | 6.86 | 5.38 | LUNCH BAG RED RETROSPOT | LUNCH BAG BLACK SKULL. |
85099B | 85099F | 38.9 | 5.67 | 5.37 | JUMBO BAG RED RETROSPOT | JUMBO BAG STRAWBERRY |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED SPOTTY | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED RETROSPOT | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED RETROSPOT | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED SPOTTY | LUNCH BAG SUKI DESIGN |
20728 | 23206 | 48.8 | 5.37 | 5.32 | LUNCH BAG CARS BLUE | LUNCH BAG APPLE DESIGN |
22386 | 85099B | 76.6 | 6.52 | 5.26 | JUMBO BAG PINK POLKADOT | JUMBO BAG RED RETROSPOT |
20725 | 20728 | 57.8 | 7.04 | 5.26 | LUNCH BAG RED RETROSPOT | LUNCH BAG CARS BLUE |
20725 | 20728 | 57.8 | 7.04 | 5.26 | LUNCH BAG RED SPOTTY | LUNCH BAG CARS BLUE |
20728 | 22382 | 59.0 | 6.49 | 5.25 | LUNCH BAG CARS BLUE | LUNCH BAG SPACEBOY DESIGN |
20727 | 20728 | 57.2 | 5.99 | 5.19 | LUNCH BAG BLACK SKULL. | LUNCH BAG CARS BLUE |
20727 | 22382 | 58.2 | 6.10 | 5.18 | LUNCH BAG BLACK SKULL. | LUNCH BAG SPACEBOY DESIGN |
20725 | 23206 | 47.3 | 5.76 | 5.16 | LUNCH BAG RED RETROSPOT | LUNCH BAG APPLE DESIGN |
20725 | 23206 | 47.3 | 5.76 | 5.16 | LUNCH BAG RED SPOTTY | LUNCH BAG APPLE DESIGN |
22382 | 22384 | 51.9 | 5.83 | 5.10 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG PINK POLKADOT |
23203 | 23209 | 54.4 | 6.29 | 5.07 | JUMBO BAG DOILEY PATTERNS | LUNCH BAG DOILEY PATTERN |
23203 | 23209 | 54.4 | 6.29 | 5.07 | JUMBO BAG DOILEY PATTERNS | mailout |
23203 | 23209 | 54.4 | 6.29 | 5.07 | mailout | LUNCH BAG DOILEY PATTERN |
23203 | 23209 | 54.4 | 6.29 | 5.07 | mailout | mailout |
20725 | 22382 | 56.7 | 6.90 | 5.05 | LUNCH BAG RED SPOTTY | LUNCH BAG SPACEBOY DESIGN |
20725 | 22382 | 56.7 | 6.90 | 5.05 | LUNCH BAG RED RETROSPOT | LUNCH BAG SPACEBOY DESIGN |
85099B | 85099C | 35.3 | 5.14 | 5.05 | JUMBO BAG RED RETROSPOT | JUMBO BAG BAROQUE BLACK WHITE |
73 rows × 7 columns
- items with descriptions like cracked, faulty, damages are not useful for recommendation
- elimination required
In [112]:
ruledf3 = ruledf3[~(ruledf3.descconc.isin(['cracked', 'faulty', 'damages', 'mouldy, thrown away.', 'mailout']))]
print('number of rules : ', len(ruledf3))
ruledf3
number of rules : 64
Out[112]:
hypoconcconfsuppliftdeschypodescconc83858291879286120461231215411326493637312930385152259624115626140...08959601171312563212611891039664535272814155311010876125
22629 | 22630 | 69.5 | 5.12 | 10.55 | SPACEBOY LUNCH BOX | DOLLY GIRL LUNCH BOX |
22697 | 22698 | 76.4 | 6.81 | 10.40 | GREEN REGENCY TEACUP AND SAUCER | PINK REGENCY TEACUP AND SAUCER |
22577 | 22578 | 73.2 | 5.76 | 9.58 | WOODEN HEART CHRISTMAS SCANDINAVIAN | WOODEN STAR CHRISTMAS SCANDINAVIAN |
22726 | 22727 | 82.0 | 6.06 | 9.17 | ALARM CLOCK BAKELIKE GREEN | ALARM CLOCK BAKELIKE RED |
22698 | 22699 | 85.0 | 6.24 | 8.70 | PINK REGENCY TEACUP AND SAUCER | ROSES REGENCY TEACUP AND SAUCER |
22727 | 22728 | 58.0 | 5.19 | 8.69 | ALARM CLOCK BAKELIKE RED | ALARM CLOCK BAKELIKE PINK |
22697 | 22699 | 83.0 | 7.41 | 8.50 | GREEN REGENCY TEACUP AND SAUCER | ROSES REGENCY TEACUP AND SAUCER |
23300 | 23301 | 75.7 | 6.08 | 8.17 | GARDENERS KNEELING PAD CUP OF TEA | GARDENERS KNEELING PAD KEEP CALM |
21754 | 21755 | 61.1 | 5.03 | 7.92 | HOME BUILDING BLOCK WORD | LOVE BUILDING BLOCK WORD |
82482 | 82494L | 68.8 | 6.47 | 7.67 | WOODEN PICTURE FRAME WHITE FINISH | WOODEN FRAME ANTIQUE WHITE |
23321 | 23322 | 58.7 | 5.07 | 7.33 | SMALL WHITE HEART OF WICKER | LARGE WHITE HEART OF WICKER |
22382 | 22662 | 47.4 | 5.33 | 6.64 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG DOLLY GIRL DESIGN |
23206 | 23207 | 56.1 | 5.14 | 6.50 | LUNCH BAG APPLE DESIGN | LUNCH BAG ALPHABET DESIGN |
20726 | 22382 | 70.5 | 6.01 | 6.27 | LUNCH BAG WOODLAND | LUNCH BAG SPACEBOY DESIGN |
22138 | 22617 | 41.7 | 5.58 | 6.07 | BAKING SET 9 PIECE RETROSPOT | BAKING SET SPACEBOY DESIGN |
20728 | 22383 | 59.8 | 6.58 | 6.01 | LUNCH BAG CARS BLUE | LUNCH BAG SUKI DESIGN |
20728 | 22383 | 59.8 | 6.58 | 6.01 | LUNCH BAG CARS BLUE | LUNCH BAG SUKI DESIGN |
20727 | 22384 | 60.9 | 6.38 | 5.98 | LUNCH BAG BLACK SKULL. | LUNCH BAG PINK POLKADOT |
20727 | 22383 | 58.2 | 6.10 | 5.85 | LUNCH BAG BLACK SKULL. | LUNCH BAG SUKI DESIGN |
20727 | 22383 | 58.2 | 6.10 | 5.85 | LUNCH BAG BLACK SKULL. | LUNCH BAG SUKI DESIGN |
20728 | 22384 | 59.2 | 6.52 | 5.81 | LUNCH BAG CARS BLUE | LUNCH BAG PINK POLKADOT |
22382 | 22383 | 57.4 | 6.45 | 5.77 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG SUKI DESIGN |
22382 | 22383 | 57.4 | 6.45 | 5.77 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG SUKI DESIGN |
20726 | 20728 | 63.2 | 5.39 | 5.74 | LUNCH BAG WOODLAND | LUNCH BAG CARS BLUE |
23199 | 23203 | 66.2 | 5.21 | 5.73 | JUMBO BAG APPLES | JUMBO BAG DOILEY PATTERNS |
20726 | 20727 | 59.2 | 5.05 | 5.65 | LUNCH BAG WOODLAND | LUNCH BAG BLACK SKULL. |
23206 | 23209 | 60.5 | 5.55 | 5.64 | LUNCH BAG APPLE DESIGN | LUNCH BAG DOILEY PATTERN |
22383 | 23206 | 51.4 | 5.12 | 5.61 | LUNCH BAG SUKI DESIGN | LUNCH BAG APPLE DESIGN |
22383 | 23206 | 51.4 | 5.12 | 5.61 | LUNCH BAG SUKI DESIGN | LUNCH BAG APPLE DESIGN |
20728 | 23207 | 48.2 | 5.30 | 5.59 | LUNCH BAG CARS BLUE | LUNCH BAG ALPHABET DESIGN |
... | ... | ... | ... | ... | ... | ... |
20725 | 20726 | 47.1 | 5.74 | 5.52 | LUNCH BAG RED RETROSPOT | LUNCH BAG WOODLAND |
22720 | 23243 | 34.6 | 5.07 | 5.51 | SET OF 3 CAKE TINS PANTRY DESIGN | SET OF TEA COFFEE SUGAR TINS PANTRY |
22383 | 22384 | 55.8 | 5.55 | 5.48 | LUNCH BAG SUKI DESIGN | LUNCH BAG PINK POLKADOT |
22383 | 22384 | 55.8 | 5.55 | 5.48 | LUNCH BAG SUKI DESIGN | LUNCH BAG PINK POLKADOT |
23207 | 23209 | 58.8 | 5.07 | 5.48 | LUNCH BAG ALPHABET DESIGN | LUNCH BAG DOILEY PATTERN |
20725 | 22384 | 55.4 | 6.74 | 5.44 | LUNCH BAG RED SPOTTY | LUNCH BAG PINK POLKADOT |
20725 | 22384 | 55.4 | 6.74 | 5.44 | LUNCH BAG RED RETROSPOT | LUNCH BAG PINK POLKADOT |
22382 | 23207 | 46.4 | 5.21 | 5.38 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG ALPHABET DESIGN |
20725 | 20727 | 56.3 | 6.86 | 5.38 | LUNCH BAG RED SPOTTY | LUNCH BAG BLACK SKULL. |
20725 | 20727 | 56.3 | 6.86 | 5.38 | LUNCH BAG RED RETROSPOT | LUNCH BAG BLACK SKULL. |
85099B | 85099F | 38.9 | 5.67 | 5.37 | JUMBO BAG RED RETROSPOT | JUMBO BAG STRAWBERRY |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED SPOTTY | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED RETROSPOT | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED RETROSPOT | LUNCH BAG SUKI DESIGN |
20725 | 22383 | 53.1 | 6.47 | 5.34 | LUNCH BAG RED SPOTTY | LUNCH BAG SUKI DESIGN |
20728 | 23206 | 48.8 | 5.37 | 5.32 | LUNCH BAG CARS BLUE | LUNCH BAG APPLE DESIGN |
22386 | 85099B | 76.6 | 6.52 | 5.26 | JUMBO BAG PINK POLKADOT | JUMBO BAG RED RETROSPOT |
20725 | 20728 | 57.8 | 7.04 | 5.26 | LUNCH BAG RED RETROSPOT | LUNCH BAG CARS BLUE |
20725 | 20728 | 57.8 | 7.04 | 5.26 | LUNCH BAG RED SPOTTY | LUNCH BAG CARS BLUE |
20728 | 22382 | 59.0 | 6.49 | 5.25 | LUNCH BAG CARS BLUE | LUNCH BAG SPACEBOY DESIGN |
20727 | 20728 | 57.2 | 5.99 | 5.19 | LUNCH BAG BLACK SKULL. | LUNCH BAG CARS BLUE |
20727 | 22382 | 58.2 | 6.10 | 5.18 | LUNCH BAG BLACK SKULL. | LUNCH BAG SPACEBOY DESIGN |
20725 | 23206 | 47.3 | 5.76 | 5.16 | LUNCH BAG RED RETROSPOT | LUNCH BAG APPLE DESIGN |
20725 | 23206 | 47.3 | 5.76 | 5.16 | LUNCH BAG RED SPOTTY | LUNCH BAG APPLE DESIGN |
22382 | 22384 | 51.9 | 5.83 | 5.10 | LUNCH BAG SPACEBOY DESIGN | LUNCH BAG PINK POLKADOT |
23203 | 23209 | 54.4 | 6.29 | 5.07 | JUMBO BAG DOILEY PATTERNS | LUNCH BAG DOILEY PATTERN |
23203 | 23209 | 54.4 | 6.29 | 5.07 | mailout | LUNCH BAG DOILEY PATTERN |
20725 | 22382 | 56.7 | 6.90 | 5.05 | LUNCH BAG RED SPOTTY | LUNCH BAG SPACEBOY DESIGN |
20725 | 22382 | 56.7 | 6.90 | 5.05 | LUNCH BAG RED RETROSPOT | LUNCH BAG SPACEBOY DESIGN |
85099B | 85099C | 35.3 | 5.14 | 5.05 | JUMBO BAG RED RETROSPOT | JUMBO BAG BAROQUE BLACK WHITE |
64 rows × 7 columns
Section 3. Drawing Network Diagram using the Extracted Association Rules
In [1]:
### WARNING >>>> ! ! !
# Check correlation network Notebooks and Redesign !!!
from matplotlib import pylab
from pylab import *
import networkx as nx
graph = nx.from_pandas_edgelist(ruledf3, source = 'deschypo', target = 'descconc',
edge_attr = 'conf', create_using = nx.DiGraph())
# set colors by item category
gnodes = np.array(graph.node)
gnode_types = ['red' if 'BAG' in x else x for x in gnodes]
gnode_types = ['green' if 'TEACUP AND SAUCER' in x else x for x in gnode_types]
gnode_types = [x if (x=='red') or (x=='green') else 'grey' for x in gnode_types]
cntbyitemtype = pd.Series(gnode_types).value_counts()
print(cntbyitemtype )
plt.figure(figsize = (6,4))
plt.bar(['BAG','OTHERS','TEACUP AND SAUCER'], cntbyitemtype,
color=['red','grey','green'])
plt.title('Numer of Nodes by type in the rule set')
plt.show()
plt.figure(figsize = (20,8))
pylab.title('purchase associations between StockCodes\n(node size= Supp ; edge width= Conf)', size=18)
nx.draw_networkx(graph, node_color=gnode_types, edge_color='skyblue',
font_size=8, alpha=0.5,
node_size=np.array(ruledf.supp)*20,
width = np.array(ruledf3.conf*10/ruledf3.conf.max()),
arrow=True,
pos=nx.spring_layout(graph) )
plt.show()
plt.figure(figsize = (20,8))
pylab.title('purchase associations between StockCodes', size=18)
nx.draw_networkx(graph, node_color=gnode_types, edge_color='skyblue',
font_size=8, alpha=0.5, node_size=30,
width = np.array(ruledf3.conf*10/ruledf3.conf.max()),
arrow=True,
pos=nx.circular_layout(graph)
)
# https://qxf2.com/blog/drawing-weighted-graphs-with-networkx/
# plt.savefig("C:/YONG/m1710/0_AI_20171102/mov/fig_arm_001.png", format = "png", dpi = 400)
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-a13d896d3650> in <module>()
8 import networkx as nx
9
---> 10 graph = nx.from_pandas_edgelist(ruledf3, source = 'deschypo', target = 'descconc',
11 edge_attr = 'conf', create_using = nx.DiGraph())
12 # set colors by item category
NameError: name 'ruledf3' is not defined
In [126]:
# node coloring example
nx.draw(graph, node_color=gnode_types, with_labels=True,
font_size=8)
plt.show()
Appendix. Practice Variations
- InvoiceNo instead of CustomerID (i.e. InvoiceNo as the basket)
- Trx date by CustomerID insetad of CustomerID (i.e. Trx day as the basket)
- Eliminate too frequent rules
'Python데이터분석' 카테고리의 다른 글
Sales EDA - Fashion Brand (0) | 2024.12.04 |
---|---|
[ FCPEDA ] 패션 고객-상품 탐색적 데이터 분석 - PYTHON (40) | 2024.10.11 |
파이썬 시각화 분석: 쉽지만 모르는 사람들이 많은 유용한 팁 5 (matplotlib) - ChatGPT (1) | 2024.01.30 |
파이썬 데이터 분석: 쉽지만 잘들 모르는 유용한 팁 10 (0) | 2023.12.25 |
[온라인 서점 고객세분화] ChatGPT가 지원하는 디지털 마케터의 시장/고객 데이터 분석 (2) | 2023.12.04 |