Professional Documents
Culture Documents
Syntax
Syntax
df.to_csv('output.csv', index=False)
create array
X=np.array(df.iloc[:,1:2])
Y=np.array(df['xxx'])
missing value
df.isnull().sum()
label encoding
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
df['xxx']=lb.fit_transform(df['xxx'])
filter data
df.query('xxx < 1000 and yyy > 10')
df.query('xxx < @aaa and yyy > @bbb')
groupby
df.groupby('xxx')['yyy'].sum()
group aggregate
df.groupby('xxx')['yyy'].agg(['mean','count'])
diff()
Datetime format (datetime64)
df=pd.read_csv('data.csv', parse_dates=['Date'])
or
df['Date']=pd.to_datetime(df['Date'])
change value
df.loc[df['xxx']=='aaa', 'xxx']='bbb'
sort values
df.sort_values('xxx', ascending=False)
Plots
df.plot(kind='bar', x='xxx', y='yyy', figsize=(12,5), title='aaa')
Multiple plots
ax = df.plot(kind='scatter', x='datum',
y='Amoxycilin_caps',label='Amoxycilin_caps')
df.plot(kind='scatter', x='datum', y='Ampicloxa', label='Ampicloxa', c='m', ax=ax)
df.plot(kind='scatter', x='datum', y='Ceftriaxone', label='Ceftriaxone', c='y',
ax=ax)
df.plot(kind='scatter', x='datum', y='Ciprofloxacin', label='Ciprofloxacin', c='g',
ax=ax)
df.plot(kind='scatter', x='datum', y='Cotrimoxazole', label='Cotrimoxazole', c='c',
ax=ax)
plt.xlabel('Date')
plt.ylabel('Quantity')
Subplots
fig,(ax1,ax2,ax3,ax4,ax5) = plt.subplots(5, figsize=(10,10))
df1.plot(x='datum', y='Amoxycilin_caps', ax=ax1)
df1.plot(x='datum', y='Ampicloxa', c='m', ax=ax2)
df1.plot(x='datum', y='Ceftriaxone', c='y', ax=ax3)
df1.plot(x='datum', y='Ciprofloxacin', c='g', ax=ax4)
df1.plot(x='datum', y='Cotrimoxazole', c='c', ax=ax5)
Boxplot and histogram
sns.boxplot(x='variable', y='value', data=pd.melt(df.iloc[:,1:6]))
df.hist()
Scaling
Standardization, mean=0, sd=1
Normalization, data scaled between 0 and 1
mms=MinMaxScaler()
dfscaled=mms.fit_transform(df[['xxx']])
Evaluation
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
rmse=np.sqrt(mean_squared_error(y, predict))
rSquared=r2_score(y, predict)
accuracy=accuracy_score(y, predict)
Prediction
trainPredict=model.predict(xtrain)
testPredict=model.predict(xtest)
Tuning Hyperparameter
from sklearn.model_selection import GridSearchCV
knn=KNeighborsClassifier()
params={'n_neighbors':[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}
bestParam=GridSearchCV(estimator=knn,param_grid=params).fit(
xtrain,ytrain).best_params_
bestParam