Spaces:

Eemansleepdeprived
/

okulary

Build error

App Files Files Community

okulary / student_analysis /datavis.py

Eemansleepdeprived

Upload 310 files

36eb7b3 verified 3 months ago

raw

history blame contribute delete

6.13 kB

	# %%
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Read the student score table from a CSV path relative to the working directory
	dataset_path = "synthetic_data_with_all_subjects.csv"
	data = pd.read_csv(dataset_path)

	# Preview the top rows (only rendered when the cell is run interactively)
	data.head()

	# %%
	# Bar chart: math score aggregated per gender
	plt.figure(figsize=(10, 6))
	ax = sns.barplot(data=data, x='gender', y='math score')
	ax.set_title('Mean Math Score by Gender')
	ax.set_xlabel('Gender')
	ax.set_ylabel('Mean Math Score')
	plt.show()

	# %%
	# Box plot: spread of reading scores for each test-preparation group
	plt.figure(figsize=(10, 6))
	ax = sns.boxplot(data=data, x='test preparation course', y='reading score')
	ax.set(
	    title='Reading Score Distribution by Test Preparation Course',
	    xlabel='Test Preparation Course',
	    ylabel='Reading Score',
	)
	plt.show()

	# %%
	# Violin plot: writing score distribution per parental education level
	plt.figure(figsize=(12, 8))
	ax = sns.violinplot(data=data, x='parental level of education', y='writing score')
	ax.set_title('Writing Score Distribution by Parental Level of Education')
	ax.set_xlabel('Parental Level of Education')
	ax.set_ylabel('Writing Score')
	# Category names are long, so tilt the tick labels
	plt.xticks(rotation=45)
	plt.show()

	# %%
	# Pair plot of every score column against every other score column
	score_columns = [
	    'math score',
	    'reading score',
	    'writing score',
	    'physics score',
	    'computer science score',
	]
	sns.pairplot(data[score_columns])
	plt.show()


	# %%
	# Per-subject mean and median, computed from one shared slice of the score columns
	subject_scores = data[['math score', 'reading score', 'writing score', 'physics score', 'computer science score']]
	mean_scores = subject_scores.mean()
	median_scores = subject_scores.median()

	# Both series are plotted in the cells that follow

	# %%
	# Bar chart of the per-subject means (subject name on x, mean on y)
	plt.figure(figsize=(10, 6))
	ax = sns.barplot(x=mean_scores.index, y=mean_scores.values)
	ax.set_title('Mean Scores for Each Subject')
	ax.set_xlabel('Subject')
	ax.set_ylabel('Mean Score')
	plt.xticks(rotation=45)
	plt.show()

	# %%
	# Bar chart of the per-subject medians (subject name on x, median on y)
	plt.figure(figsize=(10, 6))
	ax = sns.barplot(x=median_scores.index, y=median_scores.values)
	ax.set(
	    title='Median Scores for Each Subject',
	    xlabel='Subject',
	    ylabel='Median Score',
	)
	plt.xticks(rotation=45)
	plt.show()



	# %%
	# Per-subject maximum and minimum, computed from one shared slice of the score columns
	subject_scores = data[['math score', 'reading score', 'writing score', 'physics score', 'computer science score']]
	highest_scores = subject_scores.max()
	lowest_scores = subject_scores.min()


	# %%
	# Bar chart of the best score recorded in each subject
	plt.figure(figsize=(10, 6))
	ax = sns.barplot(x=highest_scores.index, y=highest_scores.values)
	ax.set_title('Highest Scores for Each Subject')
	ax.set_xlabel('Subject')
	ax.set_ylabel('Highest Score')
	plt.xticks(rotation=45)
	plt.show()

	# %%
	# Bar chart of the worst score recorded in each subject
	plt.figure(figsize=(10, 6))
	ax = sns.barplot(x=lowest_scores.index, y=lowest_scores.values)
	ax.set(
	    title='Lowest Scores for Each Subject',
	    xlabel='Subject',
	    ylabel='Lowest Score',
	)
	plt.xticks(rotation=45)
	plt.show()

	# %%
	# For every score column, find the row label of the student holding the top score
	highest_scorers = data[['math score', 'reading score', 'writing score', 'physics score', 'computer science score']].idxmax(axis=0)

	# Count how many subjects each top-scoring student leads.
	# The series is passed as `x=` explicitly: recent seaborn releases take the
	# first positional argument as `data`, not `x`.
	plt.figure(figsize=(10, 6))
	sns.countplot(x=highest_scorers)
	plt.title('Highest Scorer in Each Subject')
	# Bars are keyed by student row label; bar height is a count of subjects
	plt.xlabel('Student (row index)')
	plt.ylabel('Number of Subjects')
	plt.xticks(rotation=45)
	plt.show()

	# %% [markdown]
	# ### STUDENT INDIVIDUAL DATA VIS

	# %%
	# Take the first row of the dataset as the student to inspect
	student_data = data.iloc[0]

	# Everything from column position 5 onward is plotted as a score
	# NOTE(review): assumes the first five columns are non-score attributes — confirm against the CSV
	student_scores = student_data.iloc[5:]

	plt.figure(figsize=(10, 6))
	ax = sns.barplot(x=student_scores.index, y=student_scores.values)
	ax.set_title('Individual Student Performance')
	ax.set_xlabel('Subject')
	ax.set_ylabel('Score')
	plt.xticks(rotation=45)
	plt.show()

	# %%
	# Pair plot restricted to the score columns of the first row only
	score_columns = [
	    'math score',
	    'reading score',
	    'writing score',
	    'physics score',
	    'computer science score',
	]
	first_student_scores = data[score_columns].head(1)
	sns.pairplot(first_student_scores)
	plt.show()


	# %%
	import pandas as pd
	import numpy as np

	# Seed NumPy's global generator so the synthetic scores are reproducible
	np.random.seed(42) # for reproducibility

	# Table dimensions: one row per semester, one column per subject
	num_semesters = 6
	num_subjects = 5

	# Draw integer scores in the inclusive range 0..100
	subject_columns = ['Subject 1', 'Subject 2', 'Subject 3', 'Subject 4', 'Subject 5']
	raw_scores = np.random.randint(0, 101, size=(num_semesters, num_subjects))
	exam_scores = pd.DataFrame(raw_scores, columns=subject_columns)

	# Number the semesters 1..num_semesters
	exam_scores['Semester'] = range(1, num_semesters + 1)

	# Persist the table (without the row index) for the cells below
	exam_scores.to_csv("student_exam_scores.csv", index=False)

	# Show the first rows of the generated table
	print(exam_scores.head())

	# %%
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Load the dataset written by the generation cell
	exam_scores = pd.read_csv("student_exam_scores.csv")

	# Line plot of exam scores over semesters: one line per subject, all on a
	# single figure (only the lineplot call belongs to the loop body)
	plt.figure(figsize=(10, 6))
	for subject in ['Subject 1', 'Subject 2', 'Subject 3', 'Subject 4', 'Subject 5']:
	    sns.lineplot(x='Semester', y=subject, data=exam_scores, label=subject)
	plt.title('Exam Scores Over Semesters')
	plt.xlabel('Semester')
	plt.ylabel('Score')
	plt.legend()
	plt.grid(True)
	plt.show()

	# Box plot of exam scores distribution for each subject; the Semester
	# column is dropped so that it is not drawn as if it were a subject
	plt.figure(figsize=(10, 6))
	sns.boxplot(data=exam_scores.drop('Semester', axis=1))
	plt.title('Distribution of Exam Scores for Each Subject')
	plt.xlabel('Subject')
	plt.ylabel('Score')
	plt.show()


	# %%
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt

	# Load the dataset
	exam_scores = pd.read_csv("student_exam_scores.csv")

	# One figure and one improvement/decline report per subject
	for subject in ['Subject 1', 'Subject 2', 'Subject 3', 'Subject 4', 'Subject 5']:
	    # Line plot of exam scores over semesters for the subject
	    plt.figure(figsize=(8, 5))
	    sns.lineplot(x='Semester', y=subject, data=exam_scores)
	    plt.title(f'{subject} Exam Scores Over Semesters')
	    plt.xlabel('Semester')
	    plt.ylabel('Score')
	    plt.grid(True)
	    plt.show()

	    # Difference between consecutive semesters. The series is indexed by the
	    # Semester column so idxmax/idxmin return real semester numbers rather
	    # than 0-based row positions; the first semester has no predecessor and
	    # is therefore NaN, which idxmax/idxmin skip.
	    exam_scores_diff = (
	        exam_scores.sort_values('Semester')
	        .set_index('Semester')[subject]
	        .diff()
	    )

	    # Semester with the largest rise and the largest drop versus the previous one
	    most_improved_semester = exam_scores_diff.idxmax()
	    most_declined_semester = exam_scores_diff.idxmin()

	    # Label-based .loc lookups avoid positional [0] on a label-indexed Series
	    print(f"For {subject}:")
	    print(f"Most Improvement: Semester {most_improved_semester}, Score Increase: {exam_scores_diff.loc[most_improved_semester]}")
	    print(f"Quality Decline: Semester {most_declined_semester}, Score Decrease: {exam_scores_diff.loc[most_declined_semester]}\n")