import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Load the training and test sets from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Bare expressions: these only display something in a notebook cell and are
# no-ops when the file is run as a script.
train.columns
test.columns

# Correlation heatmap of the training features.
# Only numeric columns are passed to corr(): 'type' and 'sweetness' are still
# un-encoded at this point (they are mapped from string labels later), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
plt.figure(figsize=(12, 12))
sns.heatmap(data=train.select_dtypes(include='number').corr(), annot=True)

# Per-column count of missing values (notebook display only).
train.isnull().sum()

# Remove the 'intensity' column.
train = train.drop(['intensity'], axis=1)
def feature_change(train: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical ``type`` and ``sweetness`` columns as integers.

    Args:
        train: Frame containing a ``type`` column ('white' / 'red') and a
            ``sweetness`` column ('dry' / 'off-dry' / 'medium-sweet').

    Returns:
        A new frame with both columns replaced by their integer codes.
        Labels that are not listed in the code tables become NaN, which is
        how ``Series.map`` treats keys missing from a dict.

    Raises:
        KeyError: If either column is absent from ``train``.
    """
    type_codes = {'white': 0, 'red': 1}
    sweetness_codes = {'dry': 0, 'off-dry': 1, 'medium-sweet': 2}

    # Work on a copy so the caller's frame is not modified as a side effect.
    encoded = train.copy()
    encoded['type'] = encoded['type'].map(type_codes)
    encoded['sweetness'] = encoded['sweetness'].map(sweetness_codes)
    return encoded
# Integer-encode the categorical columns of the training frame.
train = feature_change(train)
# Remove the 'intensity' column from the test frame as well.
test = test.drop(columns=['intensity'])
def feature_change(test: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical ``type`` and ``sweetness`` columns as integers.

    Args:
        test: Frame containing a ``type`` column ('white' / 'red') and a
            ``sweetness`` column ('dry' / 'off-dry' / 'medium-sweet').

    Returns:
        A new frame with both columns replaced by their integer codes.
        Labels that are not listed in the code tables become NaN, which is
        how ``Series.map`` treats keys missing from a dict.

    Raises:
        KeyError: If either column is absent from ``test``.
    """
    type_codes = {'white': 0, 'red': 1}
    sweetness_codes = {'dry': 0, 'off-dry': 1, 'medium-sweet': 2}

    # Work on a copy so the caller's frame is not modified as a side effect.
    encoded = test.copy()
    encoded['type'] = encoded['type'].map(type_codes)
    encoded['sweetness'] = encoded['sweetness'].map(sweetness_codes)
    return encoded
# Integer-encode the categorical columns of the test frame.
test = feature_change(test)

# If only the outliers are trimmed well...
# This competition is not asking us to build a brand-new model, after all.
# What matters is a good eye for the data (interpretation).
# Keep only rows below both outlier cut-offs.
density_ok = train['density'] < 1.00128
chlorides_ok = train['chlorides'] < 0.3
train = train[density_ok & chlorides_ok]
train.info()
train.shape

# Drop the 'index' column from both frames.
train, test = (frame.drop(columns=['index']) for frame in (train, test))

# Separate the 'quality' target from the remaining feature columns.
train_y = train['quality']
train_x = train.drop(columns=['quality'])
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed is fixed so the split is
# reproducible.
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)

# Report the shape of each piece of the split.
for part in (train_x, val_x, train_y, val_y):
    print(part.shape)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Validation model: fitted on the training split only. Fitting on the whole
# of `train` (which still contains the validation rows) would leak the
# held-out labels into the model and make the report below over-optimistic.
val_model = RandomForestClassifier(n_estimators=375, max_depth=14, random_state=42)
val_model.fit(train_x, train_y)
y_pred = val_model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

# Final model: same hyper-parameters and seed, refitted on every labelled row
# so that later predictions on the test set use all available data.
model = RandomForestClassifier(n_estimators=375, max_depth=14, random_state=42)
model.fit(train.drop(columns='quality'), train['quality'])
# Predict the test set and write the predictions into the 'quality' column of
# the sample submission, then save it without the row index.
y_pred = model.predict(test)
submission = pd.read_csv('sample_submission.csv').assign(quality=y_pred)
submission.to_csv('submission_labtop5.csv', index=False)
# TODO: needs review.