>RE::VISION CRM

Python데이터분석

Association Rule Generation using Apriori

YONG_X 2025. 3. 31. 20:44

 

Association Rule Generation using Apriori

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.polynomial.polynomial import polyfit
import matplotlib.style as style 
from IPython.display import Image

import warnings
warnings.filterwarnings('ignore')

# define random jitter
def rjitt(arr):
    stdev = .01*(max(arr)-min(arr))
    return arr + np.random.randn(len(arr)) * stdev

# YONG's local data path
dataPath = 'C:/YONG/m1710/myPydata/'

Section 1. Load dataset

In [6]:
# Import Dataset 
# dataset = pd.read_csv(‘apriori_data2.csv’, header = None)

tr01a = pd.read_csv(dataPath + 'onlneRetail_A.csv',
                   dtype = {"StockCode" : "str"})

tr01b = pd.read_csv(dataPath + 'onlneRetail_B.csv')

# to bypass github upload size limitation, read pre-split datasets and merge
tr01c = pd.concat([tr01a, tr01b], axis=0)
print(tr01c.shape)

tr01 = tr01c[['CustomerID','StockCode']]
print(tr01.shape)
tr01c.head()
(541909, 8)
(541909, 2)
Out[6]:
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry01234
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 8:26 2.55 17850.0 United Kingdom
536365 71053 WHITE METAL LANTERN 6 2010-12-01 8:26 3.39 17850.0 United Kingdom
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 8:26 2.75 17850.0 United Kingdom
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 2010-12-01 8:26 3.39 17850.0 United Kingdom
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 8:26 3.39 17850.0 United Kingdom
In [7]:
# print(tr01.shape)

# 값이 없는 레코드의 수 확인 -- 분석 제외 대상
print('count_nan', len(tr01) - tr01.count())

tr01 = tr01.dropna().drop_duplicates()
print(tr01.shape)

tr01['CustomerID'] = tr01.CustomerID.astype(int).astype(str)
count_nan CustomerID    135080
StockCode          0
dtype: int64
(267615, 2)
In [8]:
tr01[:10]
Out[8]:
CustomerIDStockCode0123456789
17850 85123A
17850 71053
17850 84406B
17850 84029G
17850 84029E
17850 22752
17850 21730
17850 22633
17850 22632
13047 84879
In [9]:
# Check Frequency Distribution

nCust = len(tr01.CustomerID.unique())
print(nCust)
plt.figure(figsize = (20,4))
plt.bar(tr01.StockCode.value_counts()[:200].index,
        tr01.StockCode.value_counts()[:200]*100/nCust)
plt.ylim(0,tr01.StockCode.value_counts()[:200].max()*100/nCust)
plt.axhline(tr01.StockCode.value_counts()[:200].min()*100/nCust, 
            linestyle=':', color='grey')
plt.ylabel('PCNT Customer Ordered')
plt.title('Percentage of Customer Ordered by Item')
plt.show()
4372
In [97]:
# Check Frequency Distribution

nStockCode = len(tr01.StockCode.unique())
print(nStockCode)
plt.figure(figsize = (20,4))
WideCusts = tr01.CustomerID.value_counts()[:200]
plt.bar(WideCusts.index,
        WideCusts*100/nStockCode)
plt.ylim(0,WideCusts.max()*100/nStockCode)
plt.axhline(WideCusts.min()*100/nStockCode, 
            linestyle=':', color='grey')
plt.ylabel('PCNT StockCode Ordered')
plt.title('Percentage of Item Ordered by Customer')
plt.show()
3684
In [10]:
# Association rule generation을 위해 list format으로 변환

tmparr = []
ucusts = tr01.CustomerID.unique()
numcusts = len(ucusts)

for i in range(0, numcusts-1):
    tmpdf = tr01.loc[tr01['CustomerID']==ucusts[i]]
    tmparr.append(list(tmpdf['StockCode']))

# Check the formatted
np.array(tmparr[1])
Out[10]:
array(['84879', '22745', '22748', '22749', '22310', '84969', '22623',
       '22622', '21754', '21755', '21777', '48187', '22960', '22913',
       '22912', '22914', '21756', '22274', '16161P', '22766', '22708',
       '23231', '85015', '85016', '22829', '22722', '22961', '22969',
       '22720', '22549', '20972', '22743', '21658', '48184', '22692',
       '22499', '22607', '47566', '21902', '21901', '21899', '21900',
       '22077', '85123A', '23152', '23182', '23161', '23160', '23163',
       '23162', '23164', '23177', '23176', '23006', '22996', '23070',
       '23126', '23299', '22423', '21621', '23092', '23112', '23110',
       '23111', '23118', '23093', '23089', '23168', '23236', '23240',
       '23175', '23173', '23245', '23032', '23031', '23029', '23028',
       '85053', '22090', '22089', '22088', '23546', '23545', '22986',
       '22739', '23435', '22621', '48188', '22899', '22367', '23514',
       '23513', '23511', '23198', '23294', '23295', '23025', '23470',
       '23397', '23096', '23487', '23109', '23434', '22086', '85049E',
       'M'], dtype='<U6')

Section 2. Extracting association rules

In [47]:
# Train Apriori Model
from mlxtend.frequent_patterns import apriori 
from apyori import apriori

rules = apriori(tmparr, min_support = 0.05, min_confidence = 0.33, min_lift = 3, max_length = 2)

# try min_support = 0.02 or min_confidence = 0.2

# to check the distribution of rules ( CAUTION :: takes a while )
# rules = apriori(tmparr, min_support = 0.01, min_confidence = 0.1, min_lift = 2, max_length = 2)



# Listing the resulting Rules
results = list(rules)
# print(results[:10])

print("number of rules : " + str(len(results)) + '\n')
num_rules = len(results)

column_names = ["hypo", "conc", "conf","supp","lift"]
ruledf = pd.DataFrame(columns = column_names)

# Print rules one by one
for i in range(0, num_rules):
    result = results[i]
    # print(result)
    supp = int(result.support*10000)/100
    conf = int(result.ordered_statistics[0].confidence *1000)/10
    lift = int(result.ordered_statistics[0].lift *100)/100,
    hypo = ''.join([x+' ' for x in result.ordered_statistics[0].items_base]).strip()
    conc = ''.join([x+' ' for x in result.ordered_statistics[0].items_add]).strip()
    
    print("{ "+str(hypo)+ " ==>  " + str(conc)  +
          " } [ conf = " + str(conf) + "%,"+ " supp = "+str(supp)+"%, " +
          " lift = "+str(list(lift)[0])+"]")
    
    # insert raws into a container df
    ruledf.loc[i] = [hypo, conc, conf, supp, list(lift)[0]]
number of rules : 81

{ 20725 ==>  20726 } [ conf = 47.1%, supp = 5.74%,  lift = 5.52]
{ 20725 ==>  20727 } [ conf = 56.3%, supp = 6.86%,  lift = 5.38]
{ 20725 ==>  20728 } [ conf = 57.8%, supp = 7.04%,  lift = 5.26]
{ 20725 ==>  22382 } [ conf = 56.7%, supp = 6.9%,  lift = 5.05]
{ 20725 ==>  22383 } [ conf = 53.1%, supp = 6.47%,  lift = 5.34]
{ 20725 ==>  22384 } [ conf = 55.4%, supp = 6.74%,  lift = 5.44]
{ 20725 ==>  23206 } [ conf = 47.3%, supp = 5.76%,  lift = 5.16]
{ 20725 ==>  23207 } [ conf = 42.8%, supp = 5.21%,  lift = 4.96]
{ 20725 ==>  23209 } [ conf = 50.0%, supp = 6.08%,  lift = 4.65]
{ 20725 ==>  85099B } [ conf = 51.3%, supp = 6.24%,  lift = 3.52]
{ 20726 ==>  20727 } [ conf = 59.2%, supp = 5.05%,  lift = 5.65]
{ 20726 ==>  20728 } [ conf = 63.2%, supp = 5.39%,  lift = 5.74]
{ 20726 ==>  22382 } [ conf = 70.5%, supp = 6.01%,  lift = 6.27]
{ 20727 ==>  20728 } [ conf = 57.2%, supp = 5.99%,  lift = 5.19]
{ 20727 ==>  22382 } [ conf = 58.2%, supp = 6.1%,  lift = 5.18]
{ 20727 ==>  22383 } [ conf = 58.2%, supp = 6.1%,  lift = 5.85]
{ 20727 ==>  22384 } [ conf = 60.9%, supp = 6.38%,  lift = 5.98]
{ 20727 ==>  23207 } [ conf = 48.0%, supp = 5.03%,  lift = 5.56]
{ 20727 ==>  23209 } [ conf = 48.6%, supp = 5.1%,  lift = 4.53]
{ 20728 ==>  22382 } [ conf = 59.0%, supp = 6.49%,  lift = 5.25]
{ 20728 ==>  22383 } [ conf = 59.8%, supp = 6.58%,  lift = 6.01]
{ 20728 ==>  22384 } [ conf = 59.2%, supp = 6.52%,  lift = 5.81]
{ 20728 ==>  23206 } [ conf = 48.8%, supp = 5.37%,  lift = 5.32]
{ 20728 ==>  23207 } [ conf = 48.2%, supp = 5.3%,  lift = 5.59]
{ 20728 ==>  23209 } [ conf = 49.4%, supp = 5.44%,  lift = 4.61]
{ 21212 ==>  21977 } [ conf = 43.2%, supp = 6.29%,  lift = 4.59]
{ 21212 ==>  84991 } [ conf = 42.4%, supp = 6.17%,  lift = 4.47]
{ 21733 ==>  85123A } [ conf = 81.0%, supp = 7.13%,  lift = 4.12]
{ 21754 ==>  21755 } [ conf = 61.1%, supp = 5.03%,  lift = 7.92]
{ 21931 ==>  85099B } [ conf = 67.9%, supp = 5.19%,  lift = 4.67]
{ 22086 ==>  22910 } [ conf = 53.1%, supp = 7.48%,  lift = 4.94]
{ 22138 ==>  22617 } [ conf = 41.7%, supp = 5.58%,  lift = 6.07]
{ 22382 ==>  22383 } [ conf = 57.4%, supp = 6.45%,  lift = 5.77]
{ 22382 ==>  22384 } [ conf = 51.9%, supp = 5.83%,  lift = 5.1]
{ 22382 ==>  22662 } [ conf = 47.4%, supp = 5.33%,  lift = 6.64]
{ 22382 ==>  23206 } [ conf = 45.8%, supp = 5.14%,  lift = 4.99]
{ 22382 ==>  23207 } [ conf = 46.4%, supp = 5.21%,  lift = 5.38]
{ 22382 ==>  23209 } [ conf = 49.0%, supp = 5.51%,  lift = 4.57]
{ 22383 ==>  22384 } [ conf = 55.8%, supp = 5.55%,  lift = 5.48]
{ 22383 ==>  23206 } [ conf = 51.4%, supp = 5.12%,  lift = 5.61]
{ 22384 ==>  23209 } [ conf = 51.2%, supp = 5.21%,  lift = 4.77]
{ 22384 ==>  85099B } [ conf = 51.6%, supp = 5.26%,  lift = 3.55]
{ 22386 ==>  85099B } [ conf = 76.6%, supp = 6.52%,  lift = 5.26]
{ 22411 ==>  85099B } [ conf = 60.2%, supp = 5.17%,  lift = 4.14]
{ 22423 ==>  22697 } [ conf = 33.8%, supp = 6.86%,  lift = 3.79]
{ 22698 ==>  22423 } [ conf = 81.3%, supp = 5.97%,  lift = 4.0]
{ 22423 ==>  22699 } [ conf = 36.5%, supp = 7.41%,  lift = 3.73]
{ 22423 ==>  23245 } [ conf = 34.1%, supp = 6.93%,  lift = 3.16]
{ 22469 ==>  22470 } [ conf = 53.9%, supp = 7.06%,  lift = 4.84]
{ 22469 ==>  23321 } [ conf = 39.0%, supp = 5.12%,  lift = 4.52]
{ 22577 ==>  22578 } [ conf = 73.2%, supp = 5.76%,  lift = 9.58]
{ 22629 ==>  22630 } [ conf = 69.5%, supp = 5.12%,  lift = 10.55]
{ 22666 ==>  22720 } [ conf = 56.0%, supp = 6.06%,  lift = 3.82]
{ 22697 ==>  22698 } [ conf = 76.4%, supp = 6.81%,  lift = 10.4]
{ 22697 ==>  22699 } [ conf = 83.0%, supp = 7.41%,  lift = 8.5]
{ 22698 ==>  22699 } [ conf = 85.0%, supp = 6.24%,  lift = 8.7]
{ 22720 ==>  22722 } [ conf = 42.4%, supp = 6.22%,  lift = 4.72]
{ 22720 ==>  23243 } [ conf = 34.6%, supp = 5.07%,  lift = 5.51]
{ 22720 ==>  23245 } [ conf = 35.1%, supp = 5.14%,  lift = 3.25]
{ 22726 ==>  22727 } [ conf = 82.0%, supp = 6.06%,  lift = 9.17]
{ 22727 ==>  22728 } [ conf = 58.0%, supp = 5.19%,  lift = 8.69]
{ 22804 ==>  85123A } [ conf = 90.9%, supp = 5.26%,  lift = 4.63]
{ 22960 ==>  22961 } [ conf = 45.2%, supp = 5.94%,  lift = 3.68]
{ 23199 ==>  23203 } [ conf = 66.2%, supp = 5.21%,  lift = 5.73]
{ 23199 ==>  85099B } [ conf = 72.6%, supp = 5.71%,  lift = 4.99]
{ 23201 ==>  23203 } [ conf = 57.2%, supp = 5.17%,  lift = 4.95]
{ 23201 ==>  85099B } [ conf = 61.2%, supp = 5.53%,  lift = 4.21]
{ 23202 ==>  23203 } [ conf = 64.4%, supp = 5.55%,  lift = 5.57]
{ 23202 ==>  85099B } [ conf = 60.4%, supp = 5.21%,  lift = 4.15]
{ 23203 ==>  23209 } [ conf = 54.4%, supp = 6.29%,  lift = 5.07]
{ 23203 ==>  85099B } [ conf = 60.9%, supp = 7.04%,  lift = 4.19]
{ 23206 ==>  23207 } [ conf = 56.1%, supp = 5.14%,  lift = 6.5]
{ 23206 ==>  23209 } [ conf = 60.5%, supp = 5.55%,  lift = 5.64]
{ 23207 ==>  23209 } [ conf = 58.8%, supp = 5.07%,  lift = 5.48]
{ 23298 ==>  47566 } [ conf = 52.4%, supp = 6.86%,  lift = 3.23]
{ 23300 ==>  23301 } [ conf = 75.7%, supp = 6.08%,  lift = 8.17]
{ 23321 ==>  23322 } [ conf = 58.7%, supp = 5.07%,  lift = 7.33]
{ 23344 ==>  85099B } [ conf = 57.5%, supp = 5.14%,  lift = 3.95]
{ 82482 ==>  82494L } [ conf = 68.8%, supp = 6.47%,  lift = 7.67]
{ 85099B ==>  85099C } [ conf = 35.3%, supp = 5.14%,  lift = 5.05]
{ 85099B ==>  85099F } [ conf = 38.9%, supp = 5.67%,  lift = 5.37]
In [48]:
np.array(ruledf.tail().hypo)
np.array(ruledf.tail().conc)
ruledf.tail()
Out[48]:
hypoconcconfsupplift7677787980
23321 23322 58.7 5.07 7.33
23344 85099B 57.5 5.14 3.95
82482 82494L 68.8 6.47 7.67
85099B 85099C 35.3 5.14 5.05
85099B 85099F 38.9 5.67 5.37

visual profiling of the resulting rule set

In [49]:
# profiling the resulting rule set

def scattrules(X, Y, Xlab, Ylab):
    plt.scatter(rjitt(X), rjitt(Y), alpha=0.3)
    plt.xlabel(Xlab)
    plt.ylabel(Ylab)
    plt.axvline(X.median(), linestyle=':', color='grey')
    plt.axhline(Y.median(), linestyle=':', color='grey')
    plt.plot(np.unique(X), np.poly1d(np.polyfit(X, Y, 3))(np.unique(X)), 
             color='red', linestyle='--')
    plt.show()

X = ruledf.conf
Y = ruledf.supp
Xlab, Ylab = 'CONF', 'SUPP'

scattrules(X, Y, Xlab, Ylab)    

scattrules(ruledf.conf, ruledf.lift, 'CONF', 'LIFT')    
scattrules(ruledf.supp, ruledf.lift, 'SUPP', 'LIFT')    
In [13]:
# attach description of stockCode for human inspection

dfItemName = tr01a[['StockCode','Description']].drop_duplicates().dropna()
dfItemName['StockCode'] = np.array(dfItemName.StockCode.astype(str))
dfItemName['Description'] = np.array(dfItemName.Description.astype(str))
dfItemName.shape

dfItemName.columns = ['hypo','deschypo']
dfItemName1 = dfItemName.copy()
dfItemName1.columns = ['conc','descconc']

dfItemName1 = dfItemName1[dfItemName1.descconc!='?']

dfItemName.head()
# to check if description is correct
dfItemName1[dfItemName1.conc=='85123A']
Out[13]:
concdescconc0
85123A WHITE HANGING HEART T-LIGHT HOLDER
In [111]:
# attach descriptions using left join

ruledf1 = pd.merge(ruledf, dfItemName, how='left', on='hypo')
ruledf1.head()
ruledf1.tail(10)

ruledf2 = pd.merge(ruledf1, dfItemName1, how='left', on='conc')
ruledf2.head()
ruledf3 = ruledf2[ (ruledf2.conf>30) & (ruledf2.lift>5)].sort_values('lift', ascending=False)
print('number of rules : ', len(ruledf3))
ruledf3
number of rules :  73
Out[111]:
hypoconcconfsuppliftdeschypodescconc838582918792861204612412312154113264950363731293038515225969524114...601161171312563212611891039664535272814155311010910810776125
22629 22630 69.5 5.12 10.55 SPACEBOY LUNCH BOX DOLLY GIRL LUNCH BOX
22697 22698 76.4 6.81 10.40 GREEN REGENCY TEACUP AND SAUCER PINK REGENCY TEACUP AND SAUCER
22577 22578 73.2 5.76 9.58 WOODEN HEART CHRISTMAS SCANDINAVIAN WOODEN STAR CHRISTMAS SCANDINAVIAN
22726 22727 82.0 6.06 9.17 ALARM CLOCK BAKELIKE GREEN ALARM CLOCK BAKELIKE RED
22698 22699 85.0 6.24 8.70 PINK REGENCY TEACUP AND SAUCER ROSES REGENCY TEACUP AND SAUCER
22727 22728 58.0 5.19 8.69 ALARM CLOCK BAKELIKE RED ALARM CLOCK BAKELIKE PINK
22697 22699 83.0 7.41 8.50 GREEN REGENCY TEACUP AND SAUCER ROSES REGENCY TEACUP AND SAUCER
23300 23301 75.7 6.08 8.17 GARDENERS KNEELING PAD CUP OF TEA GARDENERS KNEELING PAD KEEP CALM
21754 21755 61.1 5.03 7.92 HOME BUILDING BLOCK WORD LOVE BUILDING BLOCK WORD
82482 82494L 68.8 6.47 7.67 WOODEN PICTURE FRAME WHITE FINISH cracked
82482 82494L 68.8 6.47 7.67 WOODEN PICTURE FRAME WHITE FINISH WOODEN FRAME ANTIQUE WHITE
23321 23322 58.7 5.07 7.33 SMALL WHITE HEART OF WICKER LARGE WHITE HEART OF WICKER
22382 22662 47.4 5.33 6.64 LUNCH BAG SPACEBOY DESIGN LUNCH BAG DOLLY GIRL DESIGN
23206 23207 56.1 5.14 6.50 LUNCH BAG APPLE DESIGN LUNCH BAG ALPHABET DESIGN
20726 22382 70.5 6.01 6.27 LUNCH BAG WOODLAND LUNCH BAG SPACEBOY DESIGN
22138 22617 41.7 5.58 6.07 BAKING SET 9 PIECE RETROSPOT BAKING SET SPACEBOY DESIGN
22138 22617 41.7 5.58 6.07 BAKING SET 9 PIECE RETROSPOT mouldy, thrown away.
20728 22383 59.8 6.58 6.01 LUNCH BAG CARS BLUE LUNCH BAG SUKI DESIGN
20728 22383 59.8 6.58 6.01 LUNCH BAG CARS BLUE LUNCH BAG SUKI DESIGN
20727 22384 60.9 6.38 5.98 LUNCH BAG BLACK SKULL. LUNCH BAG PINK POLKADOT
20727 22383 58.2 6.10 5.85 LUNCH BAG BLACK SKULL. LUNCH BAG SUKI DESIGN
20727 22383 58.2 6.10 5.85 LUNCH BAG BLACK SKULL. LUNCH BAG SUKI DESIGN
20728 22384 59.2 6.52 5.81 LUNCH BAG CARS BLUE LUNCH BAG PINK POLKADOT
22382 22383 57.4 6.45 5.77 LUNCH BAG SPACEBOY DESIGN LUNCH BAG SUKI DESIGN
22382 22383 57.4 6.45 5.77 LUNCH BAG SPACEBOY DESIGN LUNCH BAG SUKI DESIGN
20726 20728 63.2 5.39 5.74 LUNCH BAG WOODLAND LUNCH BAG CARS BLUE
23199 23203 66.2 5.21 5.73 JUMBO BAG APPLES JUMBO BAG DOILEY PATTERNS
23199 23203 66.2 5.21 5.73 JUMBO BAG APPLES mailout
20726 20727 59.2 5.05 5.65 LUNCH BAG WOODLAND LUNCH BAG BLACK SKULL.
23206 23209 60.5 5.55 5.64 LUNCH BAG APPLE DESIGN mailout
... ... ... ... ... ... ...
22383 22384 55.8 5.55 5.48 LUNCH BAG SUKI DESIGN LUNCH BAG PINK POLKADOT
23207 23209 58.8 5.07 5.48 LUNCH BAG ALPHABET DESIGN mailout
23207 23209 58.8 5.07 5.48 LUNCH BAG ALPHABET DESIGN LUNCH BAG DOILEY PATTERN
20725 22384 55.4 6.74 5.44 LUNCH BAG RED SPOTTY LUNCH BAG PINK POLKADOT
20725 22384 55.4 6.74 5.44 LUNCH BAG RED RETROSPOT LUNCH BAG PINK POLKADOT
22382 23207 46.4 5.21 5.38 LUNCH BAG SPACEBOY DESIGN LUNCH BAG ALPHABET DESIGN
20725 20727 56.3 6.86 5.38 LUNCH BAG RED SPOTTY LUNCH BAG BLACK SKULL.
20725 20727 56.3 6.86 5.38 LUNCH BAG RED RETROSPOT LUNCH BAG BLACK SKULL.
85099B 85099F 38.9 5.67 5.37 JUMBO BAG RED RETROSPOT JUMBO BAG STRAWBERRY
20725 22383 53.1 6.47 5.34 LUNCH BAG RED SPOTTY LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED RETROSPOT LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED RETROSPOT LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED SPOTTY LUNCH BAG SUKI DESIGN
20728 23206 48.8 5.37 5.32 LUNCH BAG CARS BLUE LUNCH BAG APPLE DESIGN
22386 85099B 76.6 6.52 5.26 JUMBO BAG PINK POLKADOT JUMBO BAG RED RETROSPOT
20725 20728 57.8 7.04 5.26 LUNCH BAG RED RETROSPOT LUNCH BAG CARS BLUE
20725 20728 57.8 7.04 5.26 LUNCH BAG RED SPOTTY LUNCH BAG CARS BLUE
20728 22382 59.0 6.49 5.25 LUNCH BAG CARS BLUE LUNCH BAG SPACEBOY DESIGN
20727 20728 57.2 5.99 5.19 LUNCH BAG BLACK SKULL. LUNCH BAG CARS BLUE
20727 22382 58.2 6.10 5.18 LUNCH BAG BLACK SKULL. LUNCH BAG SPACEBOY DESIGN
20725 23206 47.3 5.76 5.16 LUNCH BAG RED RETROSPOT LUNCH BAG APPLE DESIGN
20725 23206 47.3 5.76 5.16 LUNCH BAG RED SPOTTY LUNCH BAG APPLE DESIGN
22382 22384 51.9 5.83 5.10 LUNCH BAG SPACEBOY DESIGN LUNCH BAG PINK POLKADOT
23203 23209 54.4 6.29 5.07 JUMBO BAG DOILEY PATTERNS LUNCH BAG DOILEY PATTERN
23203 23209 54.4 6.29 5.07 JUMBO BAG DOILEY PATTERNS mailout
23203 23209 54.4 6.29 5.07 mailout LUNCH BAG DOILEY PATTERN
23203 23209 54.4 6.29 5.07 mailout mailout
20725 22382 56.7 6.90 5.05 LUNCH BAG RED SPOTTY LUNCH BAG SPACEBOY DESIGN
20725 22382 56.7 6.90 5.05 LUNCH BAG RED RETROSPOT LUNCH BAG SPACEBOY DESIGN
85099B 85099C 35.3 5.14 5.05 JUMBO BAG RED RETROSPOT JUMBO BAG BAROQUE BLACK WHITE

73 rows × 7 columns

  • items with descriptions like cracked, faulty, damages are not useful for recommendation
  • elimination required
In [112]:
ruledf3 = ruledf3[~(ruledf3.descconc.isin(['cracked', 'faulty', 'damages', 'mouldy, thrown away.', 'mailout']))]
print('number of rules : ', len(ruledf3))
ruledf3
number of rules :  64
Out[112]:
hypoconcconfsuppliftdeschypodescconc83858291879286120461231215411326493637312930385152259624115626140...08959601171312563212611891039664535272814155311010876125
22629 22630 69.5 5.12 10.55 SPACEBOY LUNCH BOX DOLLY GIRL LUNCH BOX
22697 22698 76.4 6.81 10.40 GREEN REGENCY TEACUP AND SAUCER PINK REGENCY TEACUP AND SAUCER
22577 22578 73.2 5.76 9.58 WOODEN HEART CHRISTMAS SCANDINAVIAN WOODEN STAR CHRISTMAS SCANDINAVIAN
22726 22727 82.0 6.06 9.17 ALARM CLOCK BAKELIKE GREEN ALARM CLOCK BAKELIKE RED
22698 22699 85.0 6.24 8.70 PINK REGENCY TEACUP AND SAUCER ROSES REGENCY TEACUP AND SAUCER
22727 22728 58.0 5.19 8.69 ALARM CLOCK BAKELIKE RED ALARM CLOCK BAKELIKE PINK
22697 22699 83.0 7.41 8.50 GREEN REGENCY TEACUP AND SAUCER ROSES REGENCY TEACUP AND SAUCER
23300 23301 75.7 6.08 8.17 GARDENERS KNEELING PAD CUP OF TEA GARDENERS KNEELING PAD KEEP CALM
21754 21755 61.1 5.03 7.92 HOME BUILDING BLOCK WORD LOVE BUILDING BLOCK WORD
82482 82494L 68.8 6.47 7.67 WOODEN PICTURE FRAME WHITE FINISH WOODEN FRAME ANTIQUE WHITE
23321 23322 58.7 5.07 7.33 SMALL WHITE HEART OF WICKER LARGE WHITE HEART OF WICKER
22382 22662 47.4 5.33 6.64 LUNCH BAG SPACEBOY DESIGN LUNCH BAG DOLLY GIRL DESIGN
23206 23207 56.1 5.14 6.50 LUNCH BAG APPLE DESIGN LUNCH BAG ALPHABET DESIGN
20726 22382 70.5 6.01 6.27 LUNCH BAG WOODLAND LUNCH BAG SPACEBOY DESIGN
22138 22617 41.7 5.58 6.07 BAKING SET 9 PIECE RETROSPOT BAKING SET SPACEBOY DESIGN
20728 22383 59.8 6.58 6.01 LUNCH BAG CARS BLUE LUNCH BAG SUKI DESIGN
20728 22383 59.8 6.58 6.01 LUNCH BAG CARS BLUE LUNCH BAG SUKI DESIGN
20727 22384 60.9 6.38 5.98 LUNCH BAG BLACK SKULL. LUNCH BAG PINK POLKADOT
20727 22383 58.2 6.10 5.85 LUNCH BAG BLACK SKULL. LUNCH BAG SUKI DESIGN
20727 22383 58.2 6.10 5.85 LUNCH BAG BLACK SKULL. LUNCH BAG SUKI DESIGN
20728 22384 59.2 6.52 5.81 LUNCH BAG CARS BLUE LUNCH BAG PINK POLKADOT
22382 22383 57.4 6.45 5.77 LUNCH BAG SPACEBOY DESIGN LUNCH BAG SUKI DESIGN
22382 22383 57.4 6.45 5.77 LUNCH BAG SPACEBOY DESIGN LUNCH BAG SUKI DESIGN
20726 20728 63.2 5.39 5.74 LUNCH BAG WOODLAND LUNCH BAG CARS BLUE
23199 23203 66.2 5.21 5.73 JUMBO BAG APPLES JUMBO BAG DOILEY PATTERNS
20726 20727 59.2 5.05 5.65 LUNCH BAG WOODLAND LUNCH BAG BLACK SKULL.
23206 23209 60.5 5.55 5.64 LUNCH BAG APPLE DESIGN LUNCH BAG DOILEY PATTERN
22383 23206 51.4 5.12 5.61 LUNCH BAG SUKI DESIGN LUNCH BAG APPLE DESIGN
22383 23206 51.4 5.12 5.61 LUNCH BAG SUKI DESIGN LUNCH BAG APPLE DESIGN
20728 23207 48.2 5.30 5.59 LUNCH BAG CARS BLUE LUNCH BAG ALPHABET DESIGN
... ... ... ... ... ... ...
20725 20726 47.1 5.74 5.52 LUNCH BAG RED RETROSPOT LUNCH BAG WOODLAND
22720 23243 34.6 5.07 5.51 SET OF 3 CAKE TINS PANTRY DESIGN SET OF TEA COFFEE SUGAR TINS PANTRY
22383 22384 55.8 5.55 5.48 LUNCH BAG SUKI DESIGN LUNCH BAG PINK POLKADOT
22383 22384 55.8 5.55 5.48 LUNCH BAG SUKI DESIGN LUNCH BAG PINK POLKADOT
23207 23209 58.8 5.07 5.48 LUNCH BAG ALPHABET DESIGN LUNCH BAG DOILEY PATTERN
20725 22384 55.4 6.74 5.44 LUNCH BAG RED SPOTTY LUNCH BAG PINK POLKADOT
20725 22384 55.4 6.74 5.44 LUNCH BAG RED RETROSPOT LUNCH BAG PINK POLKADOT
22382 23207 46.4 5.21 5.38 LUNCH BAG SPACEBOY DESIGN LUNCH BAG ALPHABET DESIGN
20725 20727 56.3 6.86 5.38 LUNCH BAG RED SPOTTY LUNCH BAG BLACK SKULL.
20725 20727 56.3 6.86 5.38 LUNCH BAG RED RETROSPOT LUNCH BAG BLACK SKULL.
85099B 85099F 38.9 5.67 5.37 JUMBO BAG RED RETROSPOT JUMBO BAG STRAWBERRY
20725 22383 53.1 6.47 5.34 LUNCH BAG RED SPOTTY LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED RETROSPOT LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED RETROSPOT LUNCH BAG SUKI DESIGN
20725 22383 53.1 6.47 5.34 LUNCH BAG RED SPOTTY LUNCH BAG SUKI DESIGN
20728 23206 48.8 5.37 5.32 LUNCH BAG CARS BLUE LUNCH BAG APPLE DESIGN
22386 85099B 76.6 6.52 5.26 JUMBO BAG PINK POLKADOT JUMBO BAG RED RETROSPOT
20725 20728 57.8 7.04 5.26 LUNCH BAG RED RETROSPOT LUNCH BAG CARS BLUE
20725 20728 57.8 7.04 5.26 LUNCH BAG RED SPOTTY LUNCH BAG CARS BLUE
20728 22382 59.0 6.49 5.25 LUNCH BAG CARS BLUE LUNCH BAG SPACEBOY DESIGN
20727 20728 57.2 5.99 5.19 LUNCH BAG BLACK SKULL. LUNCH BAG CARS BLUE
20727 22382 58.2 6.10 5.18 LUNCH BAG BLACK SKULL. LUNCH BAG SPACEBOY DESIGN
20725 23206 47.3 5.76 5.16 LUNCH BAG RED RETROSPOT LUNCH BAG APPLE DESIGN
20725 23206 47.3 5.76 5.16 LUNCH BAG RED SPOTTY LUNCH BAG APPLE DESIGN
22382 22384 51.9 5.83 5.10 LUNCH BAG SPACEBOY DESIGN LUNCH BAG PINK POLKADOT
23203 23209 54.4 6.29 5.07 JUMBO BAG DOILEY PATTERNS LUNCH BAG DOILEY PATTERN
23203 23209 54.4 6.29 5.07 mailout LUNCH BAG DOILEY PATTERN
20725 22382 56.7 6.90 5.05 LUNCH BAG RED SPOTTY LUNCH BAG SPACEBOY DESIGN
20725 22382 56.7 6.90 5.05 LUNCH BAG RED RETROSPOT LUNCH BAG SPACEBOY DESIGN
85099B 85099C 35.3 5.14 5.05 JUMBO BAG RED RETROSPOT JUMBO BAG BAROQUE BLACK WHITE

64 rows × 7 columns

Section 3. Drawing Network Diagram using the Extracted Association Rules

In [1]:
### WARNING >>>>  ! ! !

# Check correlation network Notebooks and Redesign !!!


from matplotlib import pylab
from pylab import *
import networkx as nx

graph = nx.from_pandas_edgelist(ruledf3, source = 'deschypo', target = 'descconc', 
                                  edge_attr = 'conf', create_using = nx.DiGraph())
# set colors by item category
gnodes = np.array(graph.node)
gnode_types = ['red' if 'BAG' in x else x for x in gnodes]
gnode_types = ['green' if 'TEACUP AND SAUCER' in x else x for x in gnode_types]
gnode_types = [x if (x=='red') or (x=='green')  else 'grey' for x in gnode_types]

cntbyitemtype = pd.Series(gnode_types).value_counts()
print(cntbyitemtype )

plt.figure(figsize = (6,4))
plt.bar(['BAG','OTHERS','TEACUP AND SAUCER'], cntbyitemtype,
       color=['red','grey','green'])
plt.title('Numer of Nodes by type in the rule set')
plt.show()

plt.figure(figsize = (20,8))
pylab.title('purchase associations between StockCodes\n(node size= Supp ; edge width= Conf)', size=18)
nx.draw_networkx(graph, node_color=gnode_types, edge_color='skyblue',
                 font_size=8, alpha=0.5, 
                 node_size=np.array(ruledf.supp)*20,
                 width = np.array(ruledf3.conf*10/ruledf3.conf.max()),
                 arrow=True, 
                 pos=nx.spring_layout(graph) )
                
plt.show()


plt.figure(figsize = (20,8))
pylab.title('purchase associations between StockCodes', size=18)      
nx.draw_networkx(graph, node_color=gnode_types, edge_color='skyblue',
                 font_size=8, alpha=0.5, node_size=30, 
                 width = np.array(ruledf3.conf*10/ruledf3.conf.max()),
                 arrow=True, 
                 pos=nx.circular_layout(graph)
                )

# https://qxf2.com/blog/drawing-weighted-graphs-with-networkx/ 

# plt.savefig("C:/YONG/m1710/0_AI_20171102/mov/fig_arm_001.png", format = "png", dpi = 400)
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-a13d896d3650> in <module>()
      8 import networkx as nx
      9 
---> 10 graph = nx.from_pandas_edgelist(ruledf3, source = 'deschypo', target = 'descconc', 
     11                                   edge_attr = 'conf', create_using = nx.DiGraph())
     12 # set colors by item category

NameError: name 'ruledf3' is not defined
In [126]:
# node coloring example    

nx.draw(graph, node_color=gnode_types, with_labels=True,
       font_size=8)
plt.show()
 

Appendix. Practice Variations

  • InvoiceNo instead of CustomerID (i.e. InvoiceNo as the basket)
  • Trx date by CustomerID insetad of CustomerID (i.e. Trx day as the basket)
  • Eliminate too frequent rules