Skip to content

vaishnavi2810-code/Titanic_survival_prediction

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

13 Commits
 
 
 
 
 
 

Repository files navigation

Titanic_survival_prediction

A machine learning model for predicting the survival of passengers aboard RMS Titanic, a British passenger liner that sank in the North Atlantic Ocean.

The first step is to add libraries

library(dplyr)
library(tidyr)
library(e1071)
library(rpart)
library(rpart.plot)
library(caret)
library(ModelMetrics)

Import the dataset

# Load the raw Titanic dataset from disk and take a first look at it.
data_path <- "/home/vaishnavi/Project/dataset.csv"
Titanic_Data <- read.csv(data_path)
glimpse(Titanic_Data)  # column types plus a preview of each column
View(Titanic_Data)     # open the full table in the data viewer

This dataset contains a total of 21 columns, as shown above.

Next step is to look for NA values or any unnecessary symbols

# Verify whether the placeholder symbol "?" occurs anywhere in the data.
any(Titanic_Data == "?")  # TRUE means at least one cell holds "?"
# Locate every offending cell as (row, column) index pairs.
which(Titanic_Data == "?", arr.ind = TRUE)

The above code is for the verification if there exists any "?" in the dataset.

# Count the NA values in every column, then peek at the first rows.
colSums(is.na(Titanic_Data))
head(Titanic_Data)

The above code checks for NA values. It displays the number of NA values in each column.

Next step is to remove NA values

# Drop every row whose target label (Survived) is missing, then remove the
# columns duplicated elsewhere (Pclass/Age/Name mirror Class/Age_wiki/Name_wiki).
Titanic_Data <- Titanic_Data %>%
  drop_na(Survived) %>%
  select(-c(Pclass, Age, Name))
# Re-check the per-column NA counts after the clean-up.
colSums(is.na(Titanic_Data))

The above code removes all NA values from the target class, which is Survived. Duplicate columns are also removed: the columns Pclass, Age, and Name are the same as the columns Class, Age_wiki, and Name_wiki respectively.

library(ggplot2)
# Age distribution, used to judge a sensible imputation value for Age_wiki.
ggplot(Titanic_Data, aes(x = Age_wiki)) +
  geom_bar()
# Mean age ignoring NAs; approximately 29.00.
mean(as.numeric(Titanic_Data$Age_wiki), na.rm = TRUE)

Plot a bar graph to check which is best aggregate to replace NA values in Age_wiki column. Now find the mean of Age_wiki column to replace NA values by mean.

Titanic_Data$Age_wiki <- ifelse(is.na(Titanic_Data$Age_wiki), as.character(29.00), Titanic_Data$Age_wiki)

Replacing NA values with mean.

# Inspect the Class column to pick a replacement for its NA values.
summary(Titanic_Data$Class)
str(Titanic_Data$Class)

View the summary of Class column to find out the best aggregate for replacing NA values.

Titanic_Data$Class <- ifelse(is.na(Titanic_Data$Class), median(Titanic_Data$Class,na.rm = TRUE), Titanic_Data$Class)

Replace NA values with median.

The next step is to remove the unnecessary columns.

# Examine WikiId and Body before deciding whether to drop them.
str(Titanic_Data$WikiId)
str(Titanic_Data$Body)

From this information we can see that the WikiId and Body columns have no importance for prediction, so we can remove them.

# WikiId and Body carry no predictive value, so drop both in a single step.
Titanic_Data <- select(Titanic_Data, -c(WikiId, Body))
# Class balance of the target: 1 = survived, 0 = dead.
table(Titanic_Data$Survived)

This lists the number of passengers in each target class: 1 is for survived and 0 is for not survived.

# Peek at the Ticket column, then make sure ages are stored as numbers.
head(Titanic_Data$Ticket)
Titanic_Data$Age_wiki <- as.numeric(Titanic_Data$Age_wiki)

Convert Age_wiki to numeric data type.

# Bin ages into labelled ranges. include.lowest = TRUE keeps an age of
# exactly 0 in the first bin (cut()'s default left-open first interval
# would silently turn it into NA).
age_breaks <- c(0, 10, 20, 30, 40, 50, 60, 70, 100)
age_labels <- c("0-10", "10-20", "20-30", "30-40",
                "40-50", "50-60", "60-70", "70-100")
Titanic_Data$Age_wiki <- cut(Titanic_Data$Age_wiki,
                             breaks = age_breaks,
                             labels = age_labels,
                             include.lowest = TRUE)
summary(Titanic_Data$Age_wiki)
# Confirm that no NA values remain anywhere in the dataset.
colSums(is.na(Titanic_Data))

The above code breaks the age into range of values as you can see in the image. Finally all the NA values are removed.

Training and Testing dataset

Titanic_c = Titanic_Data[,c('Class','Sex','SibSp','Parch','Fare','Age_wiki','Embarked','Survived')]

Copy this dataset as Titanic_c to divide it into training and testing parts.

# 70/30 train/test split. set.seed() makes the split reproducible across
# runs (the original sample() call produced a different split every time,
# so the reported accuracies could not be reproduced).
set.seed(1)
train_row_id <- sample(seq_len(nrow(Titanic_c)),
                       size = ceiling(0.7 * nrow(Titanic_c)))
#View(train_row_id)
X_train <- Titanic_c[train_row_id, ]
x_test <- Titanic_c[-train_row_id, ]
# Drop the target column from the test features by name rather than via the
# fragile positional index 8, which breaks if the column order changes.
x_test1 <- x_test[, names(x_test) != "Survived"]

Divide the dataset into training and testing parts, and remove the target class from the test set.

Data exploration and visualization

This step is important because it gives us a better understanding of the dataset. The better we know the data, the better our analysis will be.

library(ggplot2)

# Gender distribution of the passengers.
ggplot(Titanic_c, aes(x = Sex)) +
  geom_bar()

table(Titanic_c$Sex)

# Survival counts split by gender.
ggplot(Titanic_c, aes(x = Survived, fill = Sex)) +
  geom_histogram(bins = 5)

summary(Titanic_c$Age_wiki)

# Survival counts split by age range.
ggplot(Titanic_c, aes(x = Survived, fill = Age_wiki)) +
  geom_histogram(bins = 6)

# Passengers per boarding city, shown as a pie chart.
board_city <- unique(Titanic_Data$Boarded)
board_city[1]

Titanic_Data$Boarded[1]
# table() counts every city in one vectorised pass. The original manual loop
# hard-coded exactly five scalar counters, so it broke whenever the number
# of distinct cities differed from 5, and errored if Boarded contained NA
# (an NA comparison inside if() is an error). factor(levels = board_city)
# keeps the counts in the same order as the labels.
count_board <- as.vector(table(factor(Titanic_Data$Boarded, levels = board_city)))
pie(count_board,
    labels = board_city,
    col = rainbow(length(board_city)))

Model building - Decision Tree

First model that we build is a decision tree model.

# Fit a classification tree predicting Survived from every other feature.
model <- rpart(Survived ~ ., data = X_train, method = "class")
View(model)

Testing the model

# Score the test set and visualise the fitted tree.
model_pr <- predict(model, x_test1)
plot(model)
plot(model, uniform = TRUE, main = "Titanic Survival")
text(model, use.n = TRUE, all = TRUE, cex = .8)
View(model_pr)

# Collapse the two class-probability columns into a hard 0/1 label:
# predict 1 (survived) whenever its probability is at least as large.
model_pr <- ifelse(model_pr[, 2] >= model_pr[, 1], 1, 0)
View(model_pr)
str(model_pr)
# Confusion matrix of predicted labels against the true labels.
prediction <- table(model_pr, x_test$Survived)


# Overall accuracy (%): correct predictions over all test cases.
acc2 <- sum(diag(prediction)) / sum(prediction) * 100
print(acc2)

The above code gives us the accuracy of the model which is 80.89%.

Model building - Naive Bayes

Now we will build another model, a Naive Bayes classifier, and compare its performance with the decision tree model.

First step is to define the class variables.

# Derive a textual class label from the numeric target:
# 0 -> "No", anything else -> "Yes".
titanic_class <- ifelse(Titanic_c$Survived == 0, "No", "Yes")
# NOTE(review): titanic_2 still carries the numeric Survived column next to
# titanic_class — it must be excluded from any model trained on titanic_class.
titanic_2 <- data.frame(Titanic_c, titanic_class)

Then we will set the seed and divide our dataset into training and testing set.

# Reproducible ~70/30 split of the labelled data for Naive Bayes.
set.seed(2)
id <- sample(2, nrow(titanic_2), prob = c(.7, .3), replace = TRUE)
print(id)

# id == 1 rows go to training, id == 2 rows to testing.
titanic_train <- titanic_2[id == 1, ]
titanic_test <- titanic_2[id == 2, ]
print(titanic_train)
print(titanic_test)

Building the Naive Bayesian model.

library(e1071)
# Fit the Naive Bayes classifier. The numeric Survived column must be
# excluded from the predictors: titanic_class is derived directly from it,
# so training with "titanic_class ~ ." hands the model the answer (target
# leakage) and wildly inflates the measured accuracy.
model <- naiveBayes(titanic_class ~ . - Survived, data = titanic_train)
print(model)


Predicting the model

# Predict class labels for the held-out test set.
pmodel <- predict(model, titanic_test)

Plot confusion matrix

# Confusion matrix of predicted vs. actual class labels, then accuracy (%).
prediction <- table(pmodel, titanic_test$titanic_class)
acc2 <- sum(diag(prediction)) / sum(prediction) * 100
print(acc2)
str(pmodel)

Using Naive Bayes, we get an accuracy of 98.89%, so it appears to be the better approach for Titanic survival prediction. (Note: an accuracy this high is worth double-checking — make sure the numeric Survived column is not among the training predictors, as that would leak the target.)

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages