#importing the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')


# Load the Haberman dataset. The CSV has no header row: with the default
# header inference pandas turns the first record (30, 64, 1, 1) into column
# labels and drops it from the data, so header=None is required to keep it.
df = pd.read_csv('haberman.csv', header=None)
df.columns  # Inspect the positional column labels before they are renamed.

Index(['30', '64', '1', '1.1'], dtype='object')


# Name the columns after the fields documented by the dataset's source.
df.columns = ['age', 'year', 'nodes', 'status']


# Replace the numeric status codes with readable labels.
# One column-level replace is used instead of chained indexing
# (df.status[mask] = ...): the chained form assigns through an intermediate
# object, which pandas warns about and which leaves df unchanged when
# copy-on-write is active. Codes other than 1 and 2 are left as they are.
df['status'] = df['status'].replace({1: 'Survived', 2: 'NotSurvived'})


df.head(5)  # Peek at the first five rows of the dataset.


# Summary statistics of the numeric columns, the frame's dimensions and the
# number of rows per status label.
print(df.describe())
print("\nshape ", df.shape, "\n")
print(df['status'].value_counts(), "\n\n...")

              age        year       nodes
count  305.000000  305.000000  305.000000
mean    52.531148   62.849180    4.036066
std     10.744024    3.254078    7.199370
min     30.000000   58.000000    0.000000
25%     44.000000   60.000000    0.000000
50%     52.000000   63.000000    1.000000
75%     61.000000   66.000000    4.000000
max     83.000000   69.000000   52.000000

shape  (305, 4) 

Survived       224
NotSurvived     81
Name: status, dtype: int64 

...


def _plot_pdf(column):
    """Plot the distribution of ``column`` with one curve per status label.

    Parameters:
        column: name of the ``df`` column to draw.
    """
    # FacetGrid's `size` keyword was renamed to `height`; the old spelling is
    # rejected by current seaborn releases.
    # NOTE(review): sns.distplot is deprecated in recent seaborn; it is kept
    # here so the figures stay as they were. histplot/kdeplot are the
    # documented replacements if it has to be migrated.
    sns.FacetGrid(df, hue='status', height=5).map(sns.distplot, column).add_legend()
    plt.ylabel('P(X)=x')
    plt.title(f'PDF of {column}')
    plt.show()


_plot_pdf('year')


_plot_pdf('age')
print("\n\n...")
_plot_pdf('nodes')


...


# Empirical PDF and CDF of age over every row of df, using 18 bins.
counts, bins = np.histogram(df['age'], bins=18, density=True)
# Rescale the per-bin densities so they sum to 1 (share of rows per bin).
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
# Plot against the right-hand bin edges so each CDF point is P(X <= edge).
plt.plot(bins[1:], pdf, label='pdf')
plt.plot(bins[1:], cdf, label='cdf')
plt.xlabel("age")
plt.ylabel("PDF: P(X)=x\nCDF: P(X)<=x")
plt.legend(bbox_to_anchor=(1.3, 0.6))  # Place the legend outside the axes.
plt.title("PDF & CDF of age")
plt.show()
# The histogram is built from all patients, not only survivors, so the
# printed summary is labelled accordingly (it used to say "Survived:").
print(f"All patients (age):\n CDF:{cdf} \n BinEdges:{bins} \n\n...\n\n\n\n")

Survived:
 CDF:[0.01311475 0.04918033 0.10819672 0.1704918  0.25901639 0.34754098
 0.44262295 0.5442623  0.64262295 0.72786885 0.8        0.87540984
 0.91803279 0.95737705 0.98360656 0.99344262 0.99672131 1.        ] 
 BinEdges:[30.         32.94444444 35.88888889 38.83333333 41.77777778 44.72222222
 47.66666667 50.61111111 53.55555556 56.5        59.44444444 62.38888889
 65.33333333 68.27777778 71.22222222 74.16666667 77.11111111 80.05555556
 83.        ] 

...


# Empirical PDF and CDF of year over every row of df, using 10 bins.
counts, bins = np.histogram(df['year'], bins=10, density=True)
# Rescale the per-bin densities so they sum to 1 (share of rows per bin).
pdf = counts / counts.sum()
cdf = np.cumsum(pdf)
# Plot against the right-hand bin edges so each CDF point is P(X <= edge).
plt.plot(bins[1:], pdf, label='pdf')
plt.plot(bins[1:], cdf, label='cdf')
plt.xlabel("year")
plt.ylabel("PDF: P(X)=x\nCDF: P(X)<=x")
plt.legend(bbox_to_anchor=(1.3, 0.6))  # Place the legend outside the axes.
plt.title("PDF & CDF of year")
plt.show()
# The histogram is built from all patients, not only survivors, so the
# printed summary is labelled accordingly (it used to say "Survived:").
print(f"All patients (year):\n CDF:{cdf} \n BinEdges:{bins} \n\n...\n\n\n\n")

Survived:
 CDF:[0.20655738 0.29836066 0.38360656 0.45901639 0.55737705 0.6557377
 0.74754098 0.83934426 0.92131148 1.        ] 
 BinEdges:[58.  59.1 60.2 61.3 62.4 63.5 64.6 65.7 66.8 67.9 69. ] 

...


# Node counts split by outcome; each group gets its own 18-bin PDF/CDF.
Survived = df.loc[df['status'] == 'Survived', 'nodes']
NotSurvived = df.loc[df['status'] == 'NotSurvived', 'nodes']

# --- Survived group ---
counts, bins = np.histogram(Survived, bins=18, density=True)
pdf = counts / sum(counts)  # rescale the densities so they sum to 1
cdf = pdf.cumsum()
plt.title("The PDF and CDF of nodes for Survived")
plt.xlabel("Nodes")
plt.ylabel("PDF: P(X)=x\nCDF: P(X)<=x")
# Curves are drawn against the right-hand bin edges.
plt.plot(bins[1:], pdf, label='S_pdf')
plt.plot(bins[1:], cdf, label='S_cdf')
plt.legend(bbox_to_anchor=(1.3, 0.6))
plt.show()
print(f"Survived:\n CDF:{cdf} \n BinEdges:{bins} \n\n...\n\n\n\n")

# --- NotSurvived group ---
print("\n\n The PDF and CDF of nodes for NotSurvived")
counts1, bins1 = np.histogram(NotSurvived, bins=18, density=True)
NSpdf = counts1 / sum(counts1)  # rescale the densities so they sum to 1
NScdf = NSpdf.cumsum()
plt.title("The PDF and CDF of nodes for NotSurvived")
plt.xlabel("Nodes")
plt.ylabel("PDF: P(X)=x\nCDF: P(X)<=x")
plt.plot(bins1[1:], NSpdf, label='NS_pdf')
plt.plot(bins1[1:], NScdf, label='NS_cdf')
plt.legend(bbox_to_anchor=(1.3, 0.6))
plt.show()
print(f"NotSurvived:\n CDF:{NScdf} \n BinEdges:{bins1}\n")

Survived:
 CDF:[0.73214286 0.84375    0.88392857 0.92410714 0.93303571 0.95535714
 0.95982143 0.97321429 0.98214286 0.98660714 0.99107143 0.99553571
 0.99553571 0.99553571 0.99553571 0.99553571 0.99553571 1.        ] 
 BinEdges:[ 0.          2.55555556  5.11111111  7.66666667 10.22222222 12.77777778
 15.33333333 17.88888889 20.44444444 23.         25.55555556 28.11111111
 30.66666667 33.22222222 35.77777778 38.33333333 40.88888889 43.44444444
 46.        ] 

...


 The PDF and CDF of nodes for NotSurvived

NotSurvived:
 CDF:[0.39506173 0.56790123 0.65432099 0.75308642 0.82716049 0.86419753
 0.90123457 0.96296296 0.97530864 0.97530864 0.97530864 0.97530864
 0.98765432 0.98765432 0.98765432 0.98765432 0.98765432 1.        ] 
 BinEdges:[ 0.          2.88888889  5.77777778  8.66666667 11.55555556 14.44444444
 17.33333333 20.22222222 23.11111111 26.         28.88888889 31.77777778
 34.66666667 37.55555556 40.44444444 43.33333333 46.22222222 49.11111111
 52.        ]


# Box plots of each feature grouped by status.
sns.boxplot(x='status', y='year', data=df)
plt.title('Boxplot of status-year')
# plt.grid has to be called: the bare attribute reference did nothing.
# Passing True switches the grid on explicitly rather than toggling it.
plt.grid(True)
plt.show()


sns.boxplot(x='status', y='age', data=df)
plt.title('BoxPlot of age-status')
plt.grid(True)  # Same fix as above: call the function.
plt.show()


# The nodes plot never requested a grid, so none is added here.
sns.boxplot(x='status', y='nodes', data=df)
plt.title('Boxplot of nodes-status')
plt.show()


#(df['age'][df['status']=='NotSurvived']).min()


# Violin plots of feature pairs.
# The former size=30 argument is dropped: violinplot has no such option, and
# recent seaborn releases forward unknown keywords to matplotlib, which
# rejects them.
sns.violinplot(data=df, x='year', y='nodes')
plt.title('Violin plot of year-nodes')
plt.show()
sns.violinplot(data=df, x='year', y='age')
plt.title('Violin plot of year-age')
plt.show()
sns.violinplot(data=df, x='age', y='status')
plt.title('Violin plot of age-status')
plt.show()


#df['nodes'][df['status']=='NotSurvived'][df['nodes']>=24]


# We are unable to make sense of the data because major overlap can be observed.
# So, we will try drawing conclusions from the overall plots.


# Scatter plots of each feature pair, coloured by status.
# FacetGrid's `size` keyword was renamed to `height`; the old spelling is
# rejected by current seaborn releases.
sns.set_style('whitegrid')
(
    sns.FacetGrid(df, hue='status', height=5)
    .map(plt.scatter, 'age', 'nodes')
    .add_legend()
    .set(title='Scatter Plot of nodes-age')
)
plt.show()
print('\n')
(
    sns.FacetGrid(df, hue='status', height=5)
    .map(plt.scatter, 'age', 'year')
    .add_legend()
    .set(title='Scatter plot of year-age')
)
plt.show()
print('\n')
(
    sns.FacetGrid(df, hue='status', height=5)
    .map(plt.scatter, 'year', 'nodes')
    .add_legend()
    .set(title='Scatter plot of nodes-year')
)
plt.show()


# Pair plot of all numeric columns, coloured by status.
# pairplot's `size` keyword was renamed to `height`; the old spelling is
# rejected by current seaborn releases.
sns.set_style("whitegrid")
sns.pairplot(df, hue='status', height=5)
plt.show()


%%html
<style>
table {float:left}
</style>

Nodes	Survived	NotSurvived	Difference
2	73%	39%	34%
5	84%	56%	28%
10	92%	71%	21%
15	95%	85%	10%
30	99%	97%	2%

Univariate Analysis

Insights and Takeaways: