Professional Documents
Culture Documents
Piyush Shivhare CS201083 WebScraping - Ipynb - Colaboratory
Piyush Shivhare CS201083 WebScraping - Ipynb - Colaboratory
ipynb - Colaboratory
import requests
import pandas as pd
from bs4 import BeautifulSoup
# 1) Scrape header tags from wikipedia.org and create a DataFrame
url = 'https://en.wikipedia.org/wiki/Main_Page'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Collect every heading element, levels h1 through h6.
headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
header_text = [header.get_text() for header in headers]
df_headers = pd.DataFrame({'Headers': header_text})
# NOTE: `.style.set_caption(...)` returns a pandas Styler, so from here on
# `df_headers` is a captioned Styler (for notebook display), not a DataFrame.
df_headers = df_headers.style.set_caption('Header tags from wikipedia:')
df_headers
0 Main Page
1 Welcome to Wikipedia
4 In the news
5 On this day
9 Wikipedia languages
# 2) Scrape IMDB's top rated 50 movies data and create a data frame
url = 'https://www.imdb.com/chart/top'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) instead of silently building an empty frame.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Title cells and rating cells are selected separately and paired by position.
movies = soup.select('td.titleColumn')
ratings = soup.select('td.imdbRating strong')
movie_data = []
for movie, rating_tag in zip(movies, ratings):
    title = movie.find('a').text
    # The year is rendered wrapped in parentheses; strip them off.
    year = movie.find('span').text.strip('()')
    # Use a separate name for the tag so the loop variable is not overwritten.
    rating = rating_tag.text
    movie_data.append({'Title': title, 'Year': year, 'Rating': rating})
df_movies = pd.DataFrame(movie_data)
print('Top 50 movies on IMDB:')
df_movies.head(50)
https://colab.research.google.com/drive/1oe3IwTz1pXJQB5ruNSRv8V1J3F9Ap7We#scrollTo=tsxIIWZUA1CX&printMode=true 1/6
6/1/23, 6:26 PM webScraping.ipynb - Colaboratory
2 The Dark Knight 2008 9.0
6 The Lord of the Rings: The Return of the King 2003 8.9
8 The Lord of the Rings: The Fellowship of the Ring 2001 8.8
# Build a data frame of Indian movies from the scraped title/rating cells.
# NOTE(review): the statements that fetch the page and define `movies`,
# `ratings` and `movie_data` for this cell are not visible in this excerpt --
# confirm they run before this loop (otherwise rows from an earlier cell are reused).
for movie, rating in zip(movies, ratings):
    title = movie.find('a').text
    # The year text is wrapped in parentheses; strip them off.
    year = movie.find('span', class_='lister-item-year').text.strip('()')
    rating = rating.text
    movie_data.append({'Title': title, 'Year': year, 'Rating': rating})
df_indian_movies = pd.DataFrame(movie_data)
print("Top 50 Indian Movies on IMDB:")
df_indian_movies.head(50)

# Captured notebook output rows that were interleaved with the code above
# (they appear to be the tail of the preceding cell's table):
# 45 The Intouchables 2011 8.5
# 46 Modern Times 1936 8.4
# 47 Once Upon a Time in the West 1968 8.4
# 48 Cinema Paradiso 1988 8.4
# 49 Rear Window 1954 8.4
https://colab.research.google.com/drive/1oe3IwTz1pXJQB5ruNSRv8V1J3F9Ap7We#scrollTo=tsxIIWZUA1CX&printMode=true 2/6
6/1/23, 6:26 PM webScraping.ipynb - Colaboratory
2 Paper Flowers 1959 Rate
5 Charulata 1964 2
7 Dev.D 2009 3
9 Awaara 1951 4
11 Aparajito 1956 5
13 Thirst 1957 6
15 Sholay 1975 7
28 Guide 1965 2
30 Roja 1992 3
34 Masoom 1983 5
44 Bhumika 1977 10
47 A Breath 2004 0
49 Haqeeqat 1964 1
https://colab.research.google.com/drive/1oe3IwTz1pXJQB5ruNSRv8V1J3F9Ap7We#scrollTo=tsxIIWZUA1CX&printMode=true 3/6
6/1/23, 6:26 PM webScraping.ipynb - Colaboratory
# 4) Scrape list of respected former presidents of India and create a data frame
url = 'https://presidentofindia.nic.in/former-presidents.htm'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) instead of silently printing an empty frame.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
president_table = soup.find('table', class_='views-table')
president_data = []
# The table may be absent (e.g. the page layout changed); in that case the
# resulting frame is simply empty.
if president_table is not None:
    # Drop the first row, presumably the table's header row.
    presidents = president_table.find_all('tr')[1:]
    for president in presidents:
        columns = president.find_all('td')
        # Only rows with at least two data cells carry a name and a term.
        if len(columns) > 1:
            name = columns[0].text.strip()
            term = columns[1].text.strip()
            president_data.append({'Name': name, 'Term of Office': term})
# Passing `columns` keeps the column labels even when no rows were scraped.
df_presidents = pd.DataFrame(president_data, columns=['Name', 'Term of Office'])
print("Former Presidents of India:")
print(df_presidents)
import requests
import pandas as pd
from bs4 import BeautifulSoup
def scrape_odi_teams_men():
    """Scrape up to ten rows of the ICC men's ODI team rankings table.

    Returns:
        pandas.DataFrame: Columns ``Rank``, ``Team``, ``Matches``, ``Points``
        and ``Rating``, holding the stripped cell text as scraped. The frame
        is empty (labels only) when the rankings table is not found.

    Raises:
        requests.exceptions.RequestException: If the request times out or
            the server answers with an HTTP error status.
    """
    url = 'https://www.icc-cricket.com/rankings/mens/team-rankings/odi'
    columns = ['Rank', 'Team', 'Matches', 'Points', 'Rating']
    # A timeout keeps the call from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table')
    if table is None:
        # No matching table in the page: return an empty, labelled frame
        # rather than failing with AttributeError on `None`.
        return pd.DataFrame(columns=columns)
    team_data = []
    # Exclude the header row and consider only the top 10 teams.
    for row in table.find_all('tr')[1:11]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < len(columns):
            # Rows without enough data cells cannot be mapped; skip them.
            continue
        # zip stops after the last label, so only the leading cells are used.
        team_data.append(dict(zip(columns, cells)))
    return pd.DataFrame(team_data, columns=columns)
def scrape_odi_batsmen_men():
    """Scrape up to ten rows of the ICC men's ODI batting rankings table.

    Returns:
        pandas.DataFrame: Columns ``Rank``, ``Player``, ``Team`` and
        ``Rating``, holding the stripped cell text as scraped. The frame is
        empty (labels only) when the rankings table is not found.

    Raises:
        requests.exceptions.RequestException: If the request times out or
            the server answers with an HTTP error status.
    """
    url = 'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting'
    columns = ['Rank', 'Player', 'Team', 'Rating']
    # A timeout keeps the call from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table')
    if table is None:
        # No matching table in the page: return an empty, labelled frame
        # rather than failing with AttributeError on `None`.
        return pd.DataFrame(columns=columns)
    player_data = []
    # Exclude the header row and consider only the top 10 batsmen.
    for row in table.find_all('tr')[1:11]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < len(columns):
            # Rows without enough data cells cannot be mapped; skip them.
            continue
        # zip stops after the last label, so only the leading cells are used.
        player_data.append(dict(zip(columns, cells)))
    return pd.DataFrame(player_data, columns=columns)
def scrape_odi_bowlers_men():
    """Scrape up to ten rows of the ICC men's ODI bowling rankings table.

    Returns:
        pandas.DataFrame: Columns ``Rank``, ``Player``, ``Team`` and
        ``Rating``, holding the stripped cell text as scraped. The frame is
        empty (labels only) when the rankings table is not found.

    Raises:
        requests.exceptions.RequestException: If the request times out or
            the server answers with an HTTP error status.
    """
    url = 'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling'
    columns = ['Rank', 'Player', 'Team', 'Rating']
    # A timeout keeps the call from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table')
    if table is None:
        # No matching table in the page: return an empty, labelled frame
        # rather than failing with AttributeError on `None`.
        return pd.DataFrame(columns=columns)
    player_data = []
    # Exclude the header row and consider only the top 10 bowlers.
    for row in table.find_all('tr')[1:11]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < len(columns):
            # Rows without enough data cells cannot be mapped; skip them.
            continue
        # zip stops after the last label, so only the leading cells are used.
        player_data.append(dict(zip(columns, cells)))
    return pd.DataFrame(player_data, columns=columns)
# Invoke the three men's ODI scrapers (teams, batsmen, bowlers) in turn.
# NOTE(review): the returned DataFrames are neither assigned nor printed;
# in a notebook cell presumably only the last expression's value is
# displayed -- confirm that discarding the first two results is intended.
scrape_odi_teams_men()
scrape_odi_batsmen_men()
scrape_odi_bowlers_men()
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Function to scrape and create data frame
def scrape_icc_cricket(url):
    """Scrape the first table on an ICC rankings page into a data frame.

    Args:
        url: Address of the rankings page to fetch.

    Returns:
        pandas.DataFrame: One row per table row after the first, with the
        first four cells labelled ``Team``, ``Matches``, ``Points`` and
        ``Rating`` (stripped text as scraped). The frame is empty (labels
        only) when the page contains no table.

    Raises:
        requests.exceptions.RequestException: If the request times out or
            the server answers with an HTTP error status.
    """
    # NOTE(review): the labels are fixed, but this helper is also called with
    # player-ranking URLs whose leading cells appear to hold different fields
    # (captured output shows team codes under 'Points') -- confirm the mapping.
    columns = ['Team', 'Matches', 'Points', 'Rating']
    # Send a GET request; the timeout prevents hanging on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the table containing the required data
    table = soup.find('table')
    if table is None:
        # No table in the page: return an empty, labelled frame rather than
        # failing with AttributeError on `None`.
        return pd.DataFrame(columns=columns)
    records = []
    # Iterate over each row in the table, skipping the first (header) row
    for row in table.find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < len(columns):
            # Rows without enough data cells cannot be mapped; skip them.
            continue
        # zip stops after the last label, so only the leading cells are used.
        records.append(dict(zip(columns, cells)))
    # Create a data frame using the scraped data
    return pd.DataFrame(records, columns=columns)
# Each frame is limited to its first 10 rows so the printed tables match the
# "Top 10" captions; the scraper itself returns every row of the table.

# Scrape and create data frame for top 10 ODI teams in women's cricket
url_a = 'https://www.icc-cricket.com/rankings/womens/team-rankings/odi'
df_a = scrape_icc_cricket(url_a).head(10)
# Scrape and create data frame for top 10 women's ODI batting players
url_b = 'https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting'
df_b = scrape_icc_cricket(url_b).head(10)
# Scrape and create data frame for top 10 women's ODI all-rounders
url_c = 'https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder'
df_c = scrape_icc_cricket(url_c).head(10)
# Print the data frames
print("Top 10 ODI Teams in Women's Cricket:")
print(df_a)
print("\nTop 10 Women's ODI Batting Players:")
print(df_b)
print("\nTop 10 Women's ODI All-rounders:")
print(df_c)
Points Rating
0 AUS 754
1 SA 732
2 ENG 731
3 AUS 717
4 IND 716
.. ... ...
95 AUS 202
96 IRE 199
97 PAK 197
98 ZIM 196
99 NZ 196
https://colab.research.google.com/drive/1oe3IwTz1pXJQB5ruNSRv8V1J3F9Ap7We#scrollTo=tsxIIWZUA1CX&printMode=true 6/6