You are on page 1 of 3

save to csv

df.to_csv('output.csv', index=False)

change column name


df=df.rename(columns={'xxx': 'zzz'})

create new column


df['xxx_square']=df['xxx']**2

create array
X=np.array(df.iloc[:,1:2])
Y=np.array(df['xxx'])

missing value
df.isnull().sum()

count categorical value


df['xxx'].value_counts()

drop delete column


df=df.drop('xxx', axis=1)

label encoding
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
df['xxx']=lb.fit_transform(df['xxx'])

filter data
df.query('xxx < 1000 and yyy > 10')
df.query('xxx < @aaa and yyy > @bbb')

groupby
df.groupby('xxx')['yyy'].sum()

group aggregate
df.groupby('xxx')['yyy'].agg(['mean','count'])

diff()

Datetime format (datetime64)
df=pd.read_csv('data.csv', parse_dates=['Date'])
or
df['Date']=pd.to_datetime(df['Date'])

map, an alternative to label encoding --- values not listed in the mapping become NaN


df['xxx']=df['xxx'].map({'Yes':'0', 'No':'1'})

change value
df.loc[df['xxx']=='aaa', 'xxx']='bbb'

Summary statistics for numerical data


numerical_data = df.select_dtypes(include=[np.number])
numerical_data.describe()

sort values
df.sort_values('xxx', ascending=False)
Plots
df.plot(kind='bar', x='xxx', y='yyy', figsize=(12,5), title='aaa')

Multiple plots
ax = df.plot(kind='scatter', x='datum',
y='Amoxycilin_caps',label='Amoxycilin_caps')
df.plot(kind='scatter', x='datum', y='Ampicloxa', label='Ampicloxa', c='m', ax=ax)
df.plot(kind='scatter', x='datum', y='Ceftriaxone', label='Ceftriaxone', c='y',
ax=ax)
df.plot(kind='scatter', x='datum', y='Ciprofloxacin', label='Ciprofloxacin', c='g',
ax=ax)
df.plot(kind='scatter', x='datum', y='Cotrimoxazole', label='Cotrimoxazole', c='c',
ax=ax)
plt.xlabel('Date')
plt.ylabel('Quantity')

Subplots
fig,(ax1,ax2,ax3,ax4,ax5) = plt.subplots(5, figsize=(10,10))
df1.plot(x='datum', y='Amoxycilin_caps', ax=ax1)
df1.plot(x='datum', y='Ampicloxa', c='m', ax=ax2)
df1.plot(x='datum', y='Ceftriaxone', c='y', ax=ax3)
df1.plot(x='datum', y='Ciprofloxacin', c='g', ax=ax4)
df1.plot(x='datum', y='Cotrimoxazole', c='c', ax=ax5)

Boxplot and histogram
sns.boxplot(x='variable', y='value', data=pd.melt(df.iloc[:,1:6]))
df.hist()

Scaling
Standardization, mean=0, sd=1
Normalization, data scaled between 0 and 1

from sklearn.preprocessing import StandardScaler, MinMaxScaler


ss=StandardScaler()
dfScaled=ss.fit_transform(df[['xxx','yyy']])

mms=MinMaxScaler()
dfscaled=mms.fit_transform(df[['xxx']])

# random splitting data to train and test data


from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest=train_test_split(xScaled, y, test_size=0.3,
random_state=None)

Evaluation
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
rmse=np.sqrt(mean_squared_error(y, predict))
rSquared=r2_score(y, predict)
accuracy=accuracy_score(y, predict)

Append new row into dataframe


# creating table to store accuracy scores of the models
table = pd.DataFrame(columns=['Model','Train Accuracy %','Test Accuracy %'])
newRow = {'Model': 'KNN model', 'Train Accuracy %': trainAccScore,
          'Test Accuracy %': testAccScore}
table = pd.concat([table, pd.DataFrame([newRow])], ignore_index=True)
Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm=ConfusionMatrixDisplay(confusion_matrix(ytest,testPredict),
                          display_labels=['Grade A','Grade B','Grade C'])

Prediction
trainPredict=model.predict(xtrain)
testPredict=model.predict(xtest)

Tuning Hyperparameter
from sklearn.model_selection import GridSearchCV

knn=KNeighborsClassifier()
params={'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}

bestParam=GridSearchCV(estimator=knn,param_grid=params).fit(xtrain,ytrain).best_params_
bestParam

You might also like