[ML] Catboost and DT - Sample Script
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# fit a shallow regression tree: spend amount vs. basic customer attributes
dt = DecisionTreeRegressor(max_depth=3, min_samples_leaf=100)
dtfeatures = ['is_female', 'is_seoul', 'age']
dt.fit(cust_prfl1[dtfeatures], cust_prfl1['amt'])
# note: sklearn decision trees do not handle categorical data directly;
# categorical inputs must be numerically encoded first (see the sketch below)
plt.figure(figsize=(16, 8), dpi=80)
tree.plot_tree(dt, feature_names=dtfeatures,
               filled=True, fontsize=11, precision=2)
plt.show()
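# a minimal sketch (not in the original script): a hypothetical categorical
# column 'region' would need one-hot encoding before the sklearn tree could
# use it, e.g.:
#   region_dummies = pd.get_dummies(cust_prfl1['region'], prefix='region')
#   X = pd.concat([cust_prfl1[dtfeatures], region_dummies], axis=1)
#   dt.fit(X, cust_prfl1['amt'])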
# create a CatBoost prediction model (handles categorical features natively)
from catboost import CatBoostRegressor
categorical_features_indices = np.where(cust_prfl5[dtfeatures].dtypes == object)[0]
# shuffle, then use the first 3,000 rows for evaluation and the last 6,000 for training
cust_prfl5shuff = cust_prfl5.sample(frac=1, random_state=42)
cust_prfl5a = cust_prfl5shuff.head(3000)  # evaluation set
cust_prfl5b = cust_prfl5shuff.tail(6000)  # training set
iters = 5000
cbmodel = CatBoostRegressor(iterations=iters,
                            eval_metric='R2',
                            learning_rate=0.005,
                            loss_function='RMSE',
                            use_best_model=True,
                            depth=3,
                            random_seed=42,
                            metric_period=int(iters / 50))
cbmodel.fit(cust_prfl5b[dtfeatures], cust_prfl5b['amt2020'],
            cat_features=categorical_features_indices,
            eval_set=(cust_prfl5a[dtfeatures], cust_prfl5a['amt2020']),
            plot=True)
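# a minimal sketch (not in the original script): score the fitted model on
# the hold-out set with sklearn's r2_score
from sklearn.metrics import r2_score
preds = cbmodel.predict(cust_prfl5a[dtfeatures])
print('hold-out R2:', round(r2_score(cust_prfl5a['amt2020'], preds), 4))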
# map feature importance
fea_imp = pd.DataFrame({'imp': cbmodel.feature_importances_, 'feature': dtfeatures})
fea_imp['imp'] = fea_imp['imp'].round(2)
mean_vimp = fea_imp.imp.mean()
fea_imp = fea_imp.sort_values(['imp', 'feature'], ascending=[True, False])
# to limit the number of features shown, slice with e.g. .iloc[-5:]
_ = fea_imp.plot(kind='barh', x='feature', y='imp', figsize=(4, 4), alpha=0.7)
plt.title('Variable Importance from CatBoost')
plt.axvline(mean_vimp, linestyle=':')  # dotted reference line at the mean importance
plt.show()
# sort descending, add cumulative importance and a row total
fea_imp1 = fea_imp.sort_values('imp', ascending=False)
fea_imp1['cum_sum_imp'] = fea_imp1['imp'].cumsum().round(2)
fea_imp1.loc['row_total', 'imp'] = fea_imp1['imp'].sum()  # total only the numeric 'imp' column
fea_imp1
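# a minimal sketch (not in the original script): CatBoost can also return
# the importances directly as a sorted DataFrame
fea_imp_pretty = cbmodel.get_feature_importance(prettified=True)
print(fea_imp_pretty)  # columns: 'Feature Id', 'Importances'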
# k-means clustering
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# load the data (example: the iris dataset)
from sklearn.datasets import load_iris
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
# standardize the features to zero mean and unit variance
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
# k-means clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)  # n_init pinned for cross-version reproducibility
kmeans.fit(scaled_data)
labels = kmeans.labels_
# reduce to two dimensions with PCA for plotting
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_data)
# collect the PCA components and cluster labels into a DataFrame
result_df = pd.DataFrame(data=pca_components, columns=['PC1', 'PC2'])
result_df['Cluster'] = labels
# visualize the clusters in PCA space
plt.figure(figsize=(8, 6))
for cluster in result_df['Cluster'].unique():
    clustered = result_df[result_df['Cluster'] == cluster]
    plt.scatter(clustered['PC1'], clustered['PC2'], label=f'Cluster {cluster}')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('KMeans Clusters (PCA-reduced Features)')
plt.legend()
plt.grid(True)
plt.show()
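# a minimal sketch (not in the original script): the elbow method for
# choosing k, plotting within-cluster SSE (inertia) over a range of k
inertias = []
ks = range(1, 9)
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
    inertias.append(km.inertia_)
plt.plot(ks, inertias, marker='o')
plt.xlabel('k')
plt.ylabel('Inertia (within-cluster SSE)')
plt.title('Elbow Method for Choosing k')
plt.show()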