You are on page 1of 17

Notebook

April 5, 2024

Gender distribution in the dataset?


[ ]: import pandas as pd

# Load the Excel file


file_path = 'EXCEL FOR PRACTICAL 9.xlsx'
data = pd.read_excel(file_path)

display(data.head())

[1]: # Count the number of each gender in the dataset


gender_distribution = data['GENDER'].value_counts()

# Display the gender distribution


display(gender_distribution)

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[1], line 2
1 # Count the number of each gender in the dataset
----> 2 gender_distribution = data['GENDER'].value_counts()
4 # Display the gender distribution
5 display(gender_distribution)

NameError: name 'data' is not defined

[2]: import pandas as pd

# Load the Excel file


data = pd.read_excel('EXCEL FOR PRACTICAL 9.xlsx', skiprows=1)

# Count the number of each gender in the dataset


gender_distribution = data['GENDER'].value_counts()

# Display the gender distribution


display(gender_distribution)

---------------------------------------------------------------------------

1
KeyError Traceback (most recent call last)
File /usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805,␣
↪in Index.get_loc(self, key)

3804 try:
-> 3805 return self._engine.get_loc(casted_key)
3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.


↪PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.


↪PyObjectHashTable.get_item()

KeyError: 'GENDER'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call last)


Cell In[2], line 7
4 data = pd.read_excel('EXCEL FOR PRACTICAL 9.xlsx', skiprows=1)
6 # Count the number of each gender in the dataset
----> 7 gender_distribution = data['GENDER'].value_counts()
9 # Display the gender distribution
10 display(gender_distribution)

File /usr/local/lib/python3.11/site-packages/pandas/core/frame.py:4090, in␣


↪DataFrame.__getitem__(self, key)

4088 if self.columns.nlevels > 1:


4089 return self._getitem_multilevel(key)
-> 4090 indexer = self.columns.get_loc(key)
4091 if is_integer(indexer):
4092 indexer = [indexer]

File /usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812,␣
↪in Index.get_loc(self, key)

3807 if isinstance(casted_key, slice) or (


3808 isinstance(casted_key, abc.Iterable)
3809 and any(isinstance(x, slice) for x in casted_key)
3810 ):
3811 raise InvalidIndexError(key)
-> 3812 raise KeyError(key) from err
3813 except TypeError:
3814 # If we have a listlike key, _check_indexing_error will raise
3815 # InvalidIndexError. Otherwise we fall through and re-raise

2
3816 # the TypeError.
3817 self._check_indexing_error(key)

KeyError: 'GENDER'

[3]: import pandas as pd

# Load the Excel file


data = pd.read_excel('EXCEL FOR PRACTICAL 9.xlsx', skiprows=1)

# It seems there was an error with the column name. Let's print the column␣
↪names to understand the issue.

print(data.columns)

# Assuming the column name might have leading or trailing spaces or different␣
↪capitalization, we will standardize the column names and then count the␣

↪gender distribution.

data.columns = data.columns.str.strip().str.upper()

# Now, let's count the gender distribution again.


gender_distribution = data['GENDER'].value_counts()

# Display the gender distribution


display(gender_distribution)

Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
'Unnamed: 5', 'Unnamed: 6'],
dtype='object')

---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File /usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3805,␣
↪in Index.get_loc(self, key)

3804 try:
-> 3805 return self._engine.get_loc(casted_key)
3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.


↪PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.


↪PyObjectHashTable.get_item()

3
KeyError: 'GENDER'

The above exception was the direct cause of the following exception:

KeyError Traceback (most recent call last)


Cell In[3], line 13
10 data.columns = data.columns.str.strip().str.upper()
12 # Now, let's count the gender distribution again.
---> 13 gender_distribution = data['GENDER'].value_counts()
15 # Display the gender distribution
16 display(gender_distribution)

File /usr/local/lib/python3.11/site-packages/pandas/core/frame.py:4090, in␣


↪DataFrame.__getitem__(self, key)

4088 if self.columns.nlevels > 1:


4089 return self._getitem_multilevel(key)
-> 4090 indexer = self.columns.get_loc(key)
4091 if is_integer(indexer):
4092 indexer = [indexer]

File /usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812,␣
↪in Index.get_loc(self, key)

3807 if isinstance(casted_key, slice) or (


3808 isinstance(casted_key, abc.Iterable)
3809 and any(isinstance(x, slice) for x in casted_key)
3810 ):
3811 raise InvalidIndexError(key)
-> 3812 raise KeyError(key) from err
3813 except TypeError:
3814 # If we have a listlike key, _check_indexing_error will raise
3815 # InvalidIndexError. Otherwise we fall through and re-raise
3816 # the TypeError.
3817 self._check_indexing_error(key)

KeyError: 'GENDER'

[4]: import pandas as pd


import plotly.graph_objects as go

# Load the Excel file with correct parameters


data = pd.read_excel('EXCEL FOR PRACTICAL 9.xlsx', skiprows=1, usecols="D:F")
data.columns = ['GENDER', 'CA', 'EXAM']

# Count the number of each gender in the dataset


gender_distribution = data['GENDER'].value_counts()

4
# Create a bar chart for gender distribution using Plotly
gender_chart = go.Figure(go.Bar(x=gender_distribution.index,␣
↪y=gender_distribution.values, marker_color=['blue', 'pink']))

gender_chart.update_layout(title_text='Gender Distribution in the Dataset',␣


↪xaxis_title='Gender', yaxis_title='Count', plot_bgcolor='#111',␣

↪paper_bgcolor='#111', font=dict(color='#7FDBFF'))

gender_chart.show()

Gender Distribution in the Dataset

50

40
Count

30

20

10

0
MALE FEMALE GENDER

Gender

Loading [MathJax]/extensions/MathMenu.js

The gender distribution in the dataset was successfully analyzed and visualized. Here are the key
points:
• The dataset contains information on two genders: Male (M) and Female (F).
• A bar chart was created to visually represent the distribution of genders.
• The exact counts of each gender were not explicitly mentioned in the summary, but they were
visually represented in the bar chart.
• The chart utilized colors (blue for males, pink for females) to differentiate between the genders.
• The visualization included titles and labels for clarity.

Explore performance differences between genders


[5]: # Calculate the mean performance (CA + EXAM) for each gender
data['TOTAL'] = data['CA'] + data['EXAM']
gender_performance = data.groupby('GENDER')['TOTAL'].mean()

5
# Display the mean performance for each gender
display(gender_performance)

---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪1942, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)

1941 try:
-> 1942 res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True)
1943 except Exception as err:

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/ops.py:864, in␣


↪BaseGrouper.agg_series(self, obj, func, preserve_dtype)

862 preserve_dtype = True


--> 864 result = self._aggregate_series_pure_python(obj, func)
866 npvalues = lib.maybe_convert_objects(result, try_float=False)

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/ops.py:885, in␣


↪BaseGrouper._aggregate_series_pure_python(self, obj, func)

884 for i, group in enumerate(splitter):


--> 885 res = func(group)
886 res = extract_result(res)

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪2454, in GroupBy.mean.<locals>.<lambda>(x)

2451 else:
2452 result = self._cython_agg_general(
2453 "mean",
-> 2454 alt=lambda x:␣
↪Series(x, copy=False).mean(numeric_only=numeric_only),

2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")

File /usr/local/lib/python3.11/site-packages/pandas/core/series.py:6540, in␣


↪Series.mean(self, axis, skipna, numeric_only, **kwargs)

6532 @doc(make_doc("mean", ndim=1))


6533 def mean(
6534 self,
(…)
6538 **kwargs,
6539 ):
-> 6540 return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

File /usr/local/lib/python3.11/site-packages/pandas/core/generic.py:12417, in␣


↪NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

6
12410 def mean(
12411 self,
12412 axis: Axis | None = 0,
(…)
12415 **kwargs,
12416 ) -> Series | float:
> 12417 return self._stat_function(
12418 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
12419 )

File /usr/local/lib/python3.11/site-packages/pandas/core/generic.py:12374, in␣


↪NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)

12372 validate_bool_kwarg(skipna, "skipna", none_allowed=False)


> 12374 return self._reduce(
12375 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
12376 )

File /usr/local/lib/python3.11/site-packages/pandas/core/series.py:6448, in␣


↪Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)

6444 raise TypeError(


6445 f"Series.{name} does not allow {kwd_name}={numeric_only} "
6446 "with non-numeric dtypes."
6447 )
-> 6448 return op(delegate, skipna=skipna, **kwds)

File /usr/local/lib/python3.11/site-packages/pandas/core/nanops.py:147, in␣


↪bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)

146 else:
--> 147 result = alt(values, axis=axis, skipna=skipna, **kwds)
149 return result

File /usr/local/lib/python3.11/site-packages/pandas/core/nanops.py:404, in␣


↪_datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)

402 mask = isna(values)


--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
406 if datetimelike:

File /usr/local/lib/python3.11/site-packages/pandas/core/nanops.py:720, in␣


↪nanmean(values, axis, skipna, mask)

719 the_sum = values.sum(axis, dtype=dtype_sum)


--> 720 the_sum = _ensure_numeric(the_sum)
722 if axis is not None and getattr(the_sum, "ndim", False):

File /usr/local/lib/python3.11/site-packages/pandas/core/nanops.py:1701, in␣


↪_ensure_numeric(x)

1699 if isinstance(x, str):


1700 # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1701 raise TypeError(f"Could not convert string '{x}' to numeric")

7
1702 try:

TypeError: Could not convert string 'CAEXAM' to numeric

The above exception was the direct cause of the following exception:

TypeError Traceback (most recent call last)


Cell In[5], line 3
1 # Calculate the mean performance (CA + EXAM) for each gender
2 data['TOTAL'] = data['CA'] + data['EXAM']
----> 3 gender_performance = data.groupby('GENDER')['TOTAL'].mean()
5 # Display the mean performance for each gender
6 display(gender_performance)

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪2452, in GroupBy.mean(self, numeric_only, engine, engine_kwargs)

2445 return self._numba_agg_general(


2446 grouped_mean,
2447 executor.float_dtype_mapping,
2448 engine_kwargs,
2449 min_periods=0,
2450 )
2451 else:
-> 2452 result = self._cython_agg_general(
2453 "mean",
2454␣
↪ alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only),
2455 numeric_only=numeric_only,
2456 )
2457 return result.__finalize__(self.obj, method="groupby")

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪1998, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count,␣

↪**kwargs)

1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)


1996 return result
-> 1998 new_mgr = data.grouped_reduce(array_func)
1999 res = self._wrap_agged_manager(new_mgr)
2000 if how in ["idxmin", "idxmax"]:

File /usr/local/lib/python3.11/site-packages/pandas/core/internals/base.py:367,␣
↪in SingleDataManager.grouped_reduce(self, func)

365 def grouped_reduce(self, func):


366 arr = self.array
--> 367 res = func(arr)
368 index = default_index(len(res))
370 mgr = type(self).from_array(res, index)

8
File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪1995, in GroupBy._cython_agg_general.<locals>.array_func(values)

1992 return result


1994 assert alt is not None
-> 1995 result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
1996 return result

File /usr/local/lib/python3.11/site-packages/pandas/core/groupby/groupby.py:
↪1946, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)

1944 msg = f"agg function failed [how->{how},dtype->{ser.dtype}]"


1945 # preserve the kind of exception that raised
-> 1946 raise type(err)(msg) from err
1948 if ser.dtype == object:
1949 res_values = res_values.astype(object, copy=False)

TypeError: agg function failed [how->mean,dtype->object]

[6]: import pandas as pd


import plotly.graph_objects as go

# Load the Excel file with correct parameters


data = pd.read_excel('EXCEL FOR PRACTICAL 9.xlsx', skiprows=1, usecols="D:F")
data.columns = ['GENDER', 'CA', 'EXAM']

# Convert CA and EXAM columns to numeric


data['CA'] = pd.to_numeric(data['CA'], errors='coerce')
data['EXAM'] = pd.to_numeric(data['EXAM'], errors='coerce')

# Calculate the total score


data['TOTAL'] = data['CA'] + data['EXAM']

# Calculate the mean performance (CA + EXAM) for each gender


gender_performance = data.groupby('GENDER')['TOTAL'].mean()

# Create a bar chart for gender performance using Plotly


gender_performance_chart = go.Figure(go.Bar(x=gender_performance.index,␣
↪y=gender_performance.values, marker_color=['blue', 'pink']))

gender_performance_chart.update_layout(title_text='Mean Performance by Gender',␣


↪xaxis_title='Gender', yaxis_title='Mean Total Score', plot_bgcolor='#111',␣

↪paper_bgcolor='#111', font=dict(color='#7FDBFF'))

gender_performance_chart.show()

9
Mean Performance by Gender

50

40
Mean Total Score

30

20

10

0
FEMALE GENDER MALE

Gender

The analysis of performance differences between genders revealed the following key points:
• Performance was evaluated based on the total score, calculated as the sum of CA (Continuous
Assessment) and EXAM scores.
• Both CA and EXAM scores were converted to numeric values to ensure accurate calculations.
• The mean performance for each gender was calculated and visualized in a bar chart.
• The bar chart utilized colors (blue for males, pink for females) to differentiate between the
genders, and included titles and labels for clarity.
• The exact mean performance values for each gender were not explicitly mentioned in the
summary, but they were visually represented in the bar chart.

Compare performance distribution between genders


[7]: import plotly.graph_objects as go
import plotly.express as px

# Create a box plot to compare the performance distribution between genders


gender_performance_boxplot = px.box(data, x='GENDER', y='TOTAL', color='GENDER',
title='Performance Distribution by Gender',
labels={'TOTAL': 'Total Score', 'GENDER':␣
↪'Gender'},

color_discrete_map={'MALE': 'blue',␣
↪'FEMALE': 'pink'})

10
gender_performance_boxplot.update_layout(plot_bgcolor='#111',␣
↪paper_bgcolor='#111', font=dict(color='#7FDBFF'))

gender_performance_boxplot.show()

Performance Distribution by Gender

Gender
80
GENDER
MALE
FEMALE
70
Total Score

60

50

40

MALE FEMALE

Gender

The comparison of performance distribution between genders was visualized using a box plot,
revealing key insights:
• The box plot shows the spread of total scores (sum of CA and EXAM) for each gender.
• Colors were used to distinguish genders (blue for males, pink for females), enhancing visual
clarity.
• The plot includes median, quartiles, and potential outliers, providing a comprehensive view
of the performance distribution.
• Specific numerical details such as exact quartiles or outliers were not mentioned, but these
are represented visually in the plot.
• The visualization was designed with a dark theme, using color coding for text and background
for better contrast.

Investigate correlations between CA and EXAM scores


[8]: import plotly.express as px

# Create a scatter plot to investigate the correlation between CA and EXAM␣


↪scores

11
correlation_plot = px.scatter(data, x='CA', y='EXAM', color='GENDER',␣
↪symbol='GENDER',

title='Correlation between CA and EXAM Scores by␣


↪Gender',

labels={'CA': 'Continuous Assessment (CA) Score',␣


↪'EXAM': 'Exam Score'},

color_discrete_map={'MALE': 'blue', 'FEMALE':␣


↪'pink'})

correlation_plot.update_layout(plot_bgcolor='#111', paper_bgcolor='#111',␣
↪font=dict(color='#7FDBFF'))

correlation_plot.show()

Correlation between CA and EXAM Scores by Gender

60 GENDER
GENDER
55 MALE
FEMALE
50

45
Exam Score

40

35

30

25

20
5 10 15 20 25

Continuous Assessment (CA) Score

The investigation of correlations between CA (Continuous Assessment) and EXAM scores was
conducted through a scatter plot, revealing several insights:
• The scatter plot visualizes the relationship between CA and EXAM scores for each gender,
with colors distinguishing genders (blue for males, pink for females).
• Symbols were used to differentiate genders within the plot, enhancing the visual distinction.
• The plot includes titles and labels for clarity, explaining what CA and EXAM scores represent.
• While the plot visually suggests a relationship between CA and EXAM scores, specific cor-
relation coefficients were not calculated in this summary.
• The visualization was designed with a dark theme, using color coding for text and background

12
for better contrast.

Explore the distribution of CA and EXAM scores individually


[9]: import plotly.graph_objects as go

# Create histograms for CA and EXAM scores


cA_histogram = go.Figure()
cA_histogram.add_trace(go.Histogram(x=data['CA'], name='CA Scores',␣
↪marker_color='lightblue'))

cA_histogram.update_layout(title_text='Distribution of Continuous Assessment␣


↪(CA) Scores', xaxis_title='CA Score', yaxis_title='Count',␣

↪plot_bgcolor='#111', paper_bgcolor='#111', font=dict(color='#7FDBFF'))

exam_histogram = go.Figure()
exam_histogram.add_trace(go.Histogram(x=data['EXAM'], name='EXAM Scores',␣
↪marker_color='lightgreen'))

exam_histogram.update_layout(title_text='Distribution of Exam Scores',␣


↪xaxis_title='Exam Score', yaxis_title='Count', plot_bgcolor='#111',␣

↪paper_bgcolor='#111', font=dict(color='#7FDBFF'))

# Display the histograms


cA_histogram.show()
exam_histogram.show()

Distribution of Continuous Assessment (CA) Scores

15
Count

10

0
5 10 15 20 25

CA Score

13
Distribution of Exam Scores

25

20

15
Count

10

0
20 30 40 50

Exam Score

The exploration of the distribution of CA (Continuous Assessment) and EXAM scores was con-
ducted through histograms, revealing distinct insights for each:
• CA Scores Distribution:
– Visualized using a light blue histogram.
– The histogram shows the frequency of various CA scores across the dataset.
– The layout includes titles and labels for clarity, with a dark theme for better visual
contrast.
• EXAM Scores Distribution:
– Visualized using a light green histogram.
– Similar to the CA scores, this histogram displays the frequency of various EXAM scores.
– The design and layout follow the same theme as the CA scores histogram for consistency.

Compare average performance by gender To compare the average performance by gender,


we’ll calculate the mean total score (sum of CA and EXAM scores) for each gender. This will
involve grouping the data by gender and then calculating the mean for the total scores. Let’s
proceed with this analysis.

[10]: # Calculate the mean total score for each gender


gender_mean_performance = data.groupby('GENDER')['TOTAL'].mean()

14
# Display the mean total score for each gender
display(gender_mean_performance)

GENDER
FEMALE 56.181818
GENDER NaN
MALE 51.400000
Name: TOTAL, dtype: float64
The comparison of average performance by gender yielded the following results:
• Female: The average total score for females is approximately 56.18.
• Male: The average total score for males is approximately 51.40.
• An entry labeled GENDER with a NaN value was also observed, indicating a possible data
entry error or missing gender information for some records.
This analysis suggests that, on average, females have a higher total score compared to males in this
dataset.

Visualize the performance distribution by gender


[11]: import plotly.express as px

# Create a box plot to visualize the performance distribution by gender


performance_distribution_by_gender = px.box(data, x='GENDER', y='TOTAL',␣
↪color='GENDER',

title='Performance Distribution by␣


↪Gender',

labels={'TOTAL': 'Total Score',␣


↪'GENDER': 'Gender'},

color_discrete_map={'MALE':␣
↪'blue', 'FEMALE': 'pink'})

performance_distribution_by_gender.update_layout(plot_bgcolor='#111',␣
↪paper_bgcolor='#111', font=dict(color='#7FDBFF'))

performance_distribution_by_gender.show()

15
Performance Distribution by Gender

Gender
80
GENDER
MALE
FEMALE
70
Total Score

60

50

40

MALE FEMALE

Gender

test the null hypothesis that there is no difference between male and female students
performance
[12]: from scipy.stats import ttest_ind

# Separate the data by gender


data_male = data[data['GENDER'] == 'MALE']['TOTAL']
data_female = data[data['GENDER'] == 'FEMALE']['TOTAL']

# Perform an independent t-test between male and female students' total scores
ttest_results = ttest_ind(data_male.dropna(), data_female.dropna())

# Display the results of the t-test


print(f'T-statistic: {ttest_results.statistic:.4f}')
print(f'P-value: {ttest_results.pvalue:.4f}')

T-statistic: -1.8084
P-value: 0.0746
The independent t-test conducted to compare the performance (total scores) between male and
female students yielded the following results:
• T-statistic: -1.8084, indicating the direction and magnitude of the difference between the
group means.

16
• P-value: 0.0746, which suggests that the difference in mean performance between male and
female students is not statistically significant at the conventional 0.05 level.
Based on these results, we fail to reject the null hypothesis that there is no difference in performance
between male and female students. This implies that any observed difference in mean performance
between genders in this dataset is not statistically significant.

17

You might also like