You are on page 1 of 2

Reading an entire file at once: import csv

filename = 'siddhartha.txt' with open(“XXX.csv”) as file_pointer: Generating current date


with open(filename) as f_obj: csv_pointer = csv.reader(file_pointer) from datetime import datetime as dt
contents = f_obj.read() Writing data: can be written from a list of strings today = dt.now()
print(contents) date_string = dt.strftime(today,’%m/%d/%Y’)
data_row = [‘’_‘’,’’_ ‘’ ] print(date_string)
Reading line by line with open(“data.csv”,”w”) as file_pointer:
filename = 'siddhartha.txt' csv_pointer = csv.writer(file_pointer) Generating a specific date
with open(filename) as f_obj: csv_pointer.writerow(data_row) from datetime import datetime as dt
for line in f_obj: new_years = dt(2017,1,1)
print(line.rstrip()) import seaborn as sns fall_equinox = dt(year=2016, month=9,day=22)
import numpy as np
Storing the lines in a list: lower_triangle = np.triu(df.corr())
filename = 'siddhartha.txt' sns.heatmap(df.corr(), annot=True, mask=lower_triangle,
with open(filename) as f_obj: cmap= 'coolwarm') Create Dataframe
lines = f_obj.readlines() # triu() is a method in NumPy that returns the upper triangle of any Method 1: import pandas as pd
for line in lines: matrix given to it. listlist = [['John', 21], ['Emilie', 49]]
print(line.rstrip()) df_listlist = pd.DataFrame(listlist, columns = ['Name', 'Age'])
df.loc[:,["XXX", "YYY"]].plot.box() boxplot df_listlist
file_pointer.read() #Read as single string df.loc[:,["XXX", "YYY"]].plot.hist(alpha=0.5)overlapped colour
file_pointer.readline() #Read line by line as string diagram Method 2:
file_pointer.readlines()#Read as a single list df.plot.scatter(x='product1_price', y='product1_quantity_ordered'); dictlist = {'Name':['Robert', 'Faye'], 'Age':[56, 28]}
scatter diagram df_dictlist = pd.DataFrame(dictlist)
file_pointer = open(“filename.txt”,”w”) sns.pairplot(df)
data_list = [ “D1”,”D2”] import pandas as pd df_dictlist
for each_line in data_list: import matplotlib.pyplot as plt
print(each_line,file=file_pointer) %matplotlib inline Method 3:
file_pointer.close()
name = ['Nada', 'Gareth', 'Johnny', 'Georgina']
 df.iloc[:, 2:8].sum().plot(kind="bar") age = [25, 30, 26, 22]
file_pointer.write(s) #s will be the string variable containing data  df.iloc[:,2:8].sum().sort_values().plot(kind="bar")ascendin list_of_tuples = list(zip(name, age))
file_pointer.writelines(sequence) #write a sequence of strings to g order print(list_of_tuples) [('Nada', 25), ('Gareth', 30)]
the file  df.groupby("Distribution df = pd.DataFrame(list_of_tuples, columns = ['Name', 'Age'])
Mode How opened File Exists File does not Channel").sum().Revenue.plot(kind="bar") only df
exists distribution channel 1&2
r Read only Opens file Error df_total = df[df.Category == "Total Population Growth"] data = pd.read_csv(‘XXX.csv')
w Write only Clears the file Creates & df_resident = df[df.Category == "Resident Population Growth"]
contents opens new fig, ax = plt.subplots() DataFrame Functions:
file ax.scatter(df_total.Year, df_total.Growth)  df.head() first 5 rows(default)
a Write only File contents Creates & ax.scatter(df_resident.Year, df_resident.Growth)  df.tail()last 5 rows(default) , unless replaced by number
left intact & opens a new #ax.legend(["Total Population Growth", "Resident Population  df.columnscolumn labels of DataFrame
new data file Growth"])  df.describe() descriptive statistics abt every numeric
appended at ax.legend(df.Category.unique()) column ie count,mean,std
file’s end
 df.isnull()/isna().sum()check total no.of empty cells for
r+ Read & write Reads & Error import seaborn as sns each column
overwrites from df = pd.read_csv("supermarket2.csv")  df[df[‘column_name’].isnull()]show NaN in that column
the file’s sns.pairplot(df)  df[df.isna().any(axis=1)]show rows in DataFrame are NaN
beginning
 df[‘XXX’].fillna(df[‘XXX’].mean().round(2),inplace=True)fill
w+ Read & write Clears the file Creates & empty cells in specified column with its column mean
contents opens a new Making a scatter plot  df=df.dropna(how=’any’)delete rows with at least 1 empty
file import matplotlib.pyplot as plt cell
a+ Read & write File contents Creates & x_values = list(range(1000))
 df.corr()/mean()/median()/mode()/var()/cov()/sum()/min()/
left intact and opens a new squares = [x**2 for x in x_values]
max()
read & write at file plt.scatter(x_values, squares, s=10)
file’s end plt.show()
 df.duplicated().sum()total no. of rows with duplicated  df[df.score >= 15.0].sort_values("score", ascending=False)
values [["name","score"]] or INSERT INTO MANAGER(AA,BB) VALUES (1003, "Jenny");
 df[df[[‘AA’,’BB’]].duplicated()]show only those rows with  df.loc[df.score >= 15.0, SELECT * FROM __  * denotes listing all fields
duplicated AA and BB ["name","score"]].sort_values("score", ascending=False) SELECT A,B / DISTINCT/ COUNT(*) / MIN(XXX)/ MAX(XXX)/
 df.drop_duplicates(inplace=True)  data[(data[‘XXX’] >= 9) & (data[‘YYY']<= 4)]
SUM(XXX) AS TOTAL_VALUE/ AVG(XXX) FROM __
 df.drop(columns = [‘??’], inplace = True)drop columns  filtered.reset_index(drop = True, inplace = True)
 df.drop(row_index_number) WHERE XXX >= 12
 data[(data[‘XXX’] >= 9) | (data[‘YYY']<= 4)]  ‘OR’
 df.groupby([‘column_name’]).mean()/.sum()/.count() <>(not equal), ^(to the power of)
 data[data[‘XXX’].str.contains/endswith/startswith(‘aaa’)]
 df.groupby([‘XXX’]).count()[‘YYY’].plot(kind=’bar’) no. of BETWEEN, IS NULL, LIKE, EXISTS, IN, AND
 data[~data[‘XXX’].str.contains(‘aaa’)] print those w/o ‘aaa’
YYY under each XXX  row = ["Peter", 18.0, 2, "yes"]
 df.XXX.replace(“aaa”,”AAA”, inplace=True)all aaa in df.loc[len(df)] = row
dataFrame  vertical_stack=pd.concat([XX_df.head(5),
add new row to the bottom
 df.iloc[3,1]= 14.6 replace particular entry only YY_df.tail(10)],axis=0)

1. Display all album titles by Iron Maiden  vertical_stack.reset_index(inplace=True) renumber the


DataFrame Attributes: index
SELECT Title,
 df.shape(rows,columns)  horizontal_stack=pd.concat([XX_df.head(5),
Name
 df.info()check empty cells or data type of each column YY_df.tail(10)],axis=1)
FROM albums
 df.XXX read column,if column name w/o empty spaces Combining data
INNER JOIN
 data[[‘XXX’,’YYY’]]
artists ON artists.ArtistId = albums.ArtistId
 type(df.column_name) WHERE Name = "Iron Maiden";
2. Find all artists who do not have any albums, sorted by
Indexing: artists’ names
 df.iloc[1]/data.iloc[1,:] SELECT Name,
 df.iloc[3,4]read (row,column) Title
 df.iloc[[0,2]] row index 0 and 2 FROM artists
 df.iloc[0,2]select cell at row 0 and column 2 LEFT JOIN
 df.iloc[:4]/data.iloc[:4,:]read row 0 to 3 albums ON artists.ArtistId = albums.ArtistId
 df.iloc[[0,1],[0,1]]select row 0 &1, column 0 & 1 WHERE Title IS NULL
 df.iloc[2:4,5:7]select row 2 & 3, and give value from ORDER BY Name;
column 5 & 6 3. Display all unique cities for each country where the
 df.iloc[(df.column_name>5000).values, customers are located in


col_start_index:col_stop_index_exclusive]
df.iloc[(df.column_name ==
SELECT DISTINCT city,  merged_inner = pd.merge(left=XXX,right=YYY)  if a row in
country XXX has a value of AA that doesn’t appear in AA column of
“???”).values,col_start_index:col_stop_index_exclusive] FROM customers YYY, it will not be included
 df.loc[:,’XXX’]read (row,column_name) ORDER BY country;
 df.loc[[0,2]]row index 0 & 2 4. Find the number of tracks for each album ID
 df.loc[2:4,’XXX’]read row 2 to 4, in that particular column SELECT albumid,
 df.loc[df['gender']=='Female', COUNT(trackid)
[‘column_name1’,’column_name2’] FROM tracks
df.loc[0,['EmpID']] EmpID 21 (read down) GROUP BY albumid;
 df.loc[[0],'EmpID']  0 21 (read across) 5. Find the total length and bytes for each album ID
 df.loc[0,'EmpID'] 21 (read across) SELECT albumid,
 df.loc[[0],["EmpID"]] 0 21 , with column name on top SUM(milliseconds) AS length,
(read down) SUM(bytes) AS size
FROM tracks
 df.sort_values(‘XXX’)sort one column in ascending order GROUP BY albumid;
 df.sort_values(‘XXX’,ascending=False)sort in descending 6. Display the track IDs & track names for which media type id
order is 1 or 2
 df.sort_values([‘XXX’,’YYY’],ascending = (True, False) ) SELECT TrackId,
 #not saved, if want need to add ,inplace =True Name,
 data[‘ZZZ’] = data[‘XXX’] * data[‘YYY’] Mediatypeid
 data[‘ZZZ’] = data.loc[:,[‘XXX’,’YYY’]].sum(axis=1)add FROM Tracks
column ZZZ=XXX+YYY WHERE MediaTypeId IN (1, 2);

You might also like