####################################################################################
#
# PRELIMINARY ANALYSIS
#
####################################################################################
# Load the Kaggle Titanic datasets (train carries the Survived label, test does not).
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)
# Merge the two datasets into one combined frame.
# First give the test set a Survived column filled with the placeholder "None"
# (after rbind this makes the whole Survived column character).
test.survived = data.frame(Survived = rep("None", nrow(test)), test[,])
# data.frame() put Survived first; swap columns 1 and 2 (values, then the
# column names) so the layout matches train exactly.
test.survived[ , c(1,2)] <- test.survived[ , c(2,1)]
colnames(test.survived)[c(1,2)] <- colnames(test.survived)[c(2,1)]
# The layouts now match, so the two frames can be stacked.
data.combined = rbind(train, test.survived)
# Drop column 1 (PassengerID): a pure row identifier, useless as a feature.
data.combined[1] <- NULL
# Structural overview of the combined frame.
str(data.combined)
# Survived and Pclass should not stay int/chr: Pclass is categorical-ordinal
# (class 3 is not numerically "greater than" class 1), so treat both as factors.
data.combined$Survived = as.factor(data.combined$Survived)
data.combined$Pclass = as.factor(data.combined$Pclass)
# Tabulate the target: the labeled data is not badly skewed (no heavy imbalance).
table(data.combined$Survived)
# Distribution across classes, and class vs. survival cross-tabulation.
table(data.combined$Pclass)
table(data.combined$Pclass, data.combined$Survived)
# Count rows matching a condition (two equivalent idioms):
count_0 = length(which(data.combined$Survived == "0"))
count_1 = length(which(data.combined$Survived == "1"))
# ...or sum the logical vector directly.
sum_0 = sum(data.combined$Survived == "0")
sum_1 = sum(data.combined$Survived == "1")
table(data.combined$Pclass, data.combined$Survived)
# Share of each (Pclass, Survived) cell over all labeled passengers.
tableSC = table(data.combined$Pclass, data.combined$Survived)/(count_0+count_1)
# To display these shares as percentages, define a small helper below.
percent <- function(x, digits = 2, format = "f") {
  # Format a proportion (e.g. 0.255) as a percentage string (e.g. "25.50%").
  # `digits` and `format` are forwarded to formatC().
  scaled <- formatC(100 * x, format = format, digits = digits)
  paste0(scaled, "%")
}
percent(tableSC)
# Visual check of the hypothesis "the rich survived more".
library(ggplot2)
train$Pclass = as.factor(train$Pclass)
ggplot(train, aes(x=Pclass, fill=factor(Survived)))+
geom_bar(width=0.5) +
xlab("Class") +
ylab("Total Count") +
labs(fill="Survived") +
scale_fill_brewer(palette="Paired")
# Name should be character, not factor.
train$Name = as.character(train$Name)
str(train)
head(train$Name) #peek at the first names
length(unique(train$Name)) #all names within train are unique
length(unique(data.combined$Name)) #across both sets there are 2 duplicated names
# Are these duplicated records to be removed, or two people sharing a name?
# Collect the duplicated names...
dup.names = as.character(data.combined[which(duplicated(data.combined$Name)), "Name"])
# ...and inspect their full records.
data.combined[which(data.combined$Name %in% dup.names),]
# Names embed titles ("Miss.", "Mrs.", "Mr.", ...): check whether they can be
# used in the analysis.
library(stringr)
misses = data.combined[which(str_detect(data.combined$Name, "Miss.")),]
misses[1:6,]
mrs = data.combined[which(str_detect(data.combined$Name, "Mrs.")),]
mrs[1:6,]
# Now check the sex patterns.
# NOTE(review): the condition below uses train$Sex (891 rows) to index
# data.combined (1309 rows); which() effectively restricts this to the training
# rows — confirm that was intentional and not a train/data.combined mix-up.
males = data.combined[which(train$Sex == "male"), ]
males[1:6,]
table(train$Survived, train$Sex)
Nmale = sum(train$Sex=="male")
Nfemale = sum(train$Sex=="female")
Nfemale00 = sum(train$Sex=="female" & train$Survived=="0") #same count, using [ ] column selection:
Nfemale0 = sum(train['Sex']=="female" & train['Survived']=="0")
Nfemale1 = sum(train$Sex=="female" & train$Survived=="1")
Nmale0 = sum(train$Sex=="male" & train$Survived=="0")
Nmale1 = sum(train$Sex=="male" & train$Survived=="1")
# Survival/death rates by sex: the correlation with sex is clear.
Nfemale1/Nfemale
Nfemale0/Nfemale
Nmale1/Nmale
Nmale0/Nmale
# Difference between str_detect and grep:
data.combined$Name = as.character(data.combined$Name)
ciao1 = grep("Miss.", data.combined$Name)
ciao2 = str_detect(data.combined$Name, "Miss.")
ciao2[1:100] #logical vector: TRUE where the Name matches "Miss."
ciao1[1:100] #integer vector: the row positions of the matching Names
# The title in the name is another useful signal (expands on Pclass vs Survived).
# Extract the honorific title from a passenger name.
#
# @param Name a single passenger name (coerced to character)
# @return one of "Miss.", "Mrs.", "Master.", "Mr.", "Other."
#
# Titles are tested in a fixed priority order; with literal matching the
# patterns no longer overlap (literal "Mr." cannot match inside "Mrs.").
# Fix vs. original: grep() treated the patterns as regexes, where "." matches
# ANY character — e.g. "Mr." also matched names like "Mrozek". fixed = TRUE
# makes the dot literal.
extractTitle = function(Name) {
  Name = as.character(Name)
  if (any(grepl("Miss.", Name, fixed = TRUE))) {
    return("Miss.")
  } else if (any(grepl("Mrs.", Name, fixed = TRUE))) {
    return("Mrs.")
  } else if (any(grepl("Master.", Name, fixed = TRUE))) {
    return("Master.")
  } else if (any(grepl("Mr.", Name, fixed = TRUE))) {
    return("Mr.")
  } else {
    return("Other.")
  }
}
# Build the Title column by applying extractTitle() to every Name.
# Fix vs. original: growing `titles` with c() inside a for-loop copies the
# vector on every append (quadratic); vapply() does it in one pass and
# guarantees a character result of the right length.
titles = vapply(data.combined$Name, extractTitle, character(1), USE.NAMES = FALSE)
data.combined$Title = as.factor(titles)
####################################################################################
# Why `if (str_detect(data.combined$Name, "Miss."))` did not work:
# str_detect() on the whole column returns a logical VECTOR (one element per
# row), while if() requires a single scalar condition. Applying the extractor
# per element — or vectorised, as above — avoids that.
####################################################################################
# Plot Title vs survival on the labeled rows (1:891), faceted by Pclass.
data.combined$Title = as.factor(data.combined$Title)
ggplot(data.combined[1:891,], aes(x=Title, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Title") +
ylab("Total Count") +
labs(fill = "Survived")
# Explore the Sex variable.
table(data.combined$Sex) #males are roughly double the females -> imbalance to keep in mind
# Plot Sex vs survival, faceted by Pclass.
ggplot(data.combined[1:891,], aes(x=Sex, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") +
ggtitle("Pclass") +
xlab("Sex") +
ylab("Total Count") +
labs(fill = "Survived")
# Age: many values are missing, even within the training rows.
summary(data.combined$Age)
summary(data.combined[1:891, "Age"])
# A proxy for Age would be useful (Title is a candidate).
# Plot Age vs survival, faceted by Sex and Pclass.
ggplot(data.combined[1:891,], aes(x=Age, fill = Survived)) +
geom_histogram(binwidth = 10) +
facet_wrap(~Sex+Pclass) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
library(Hmisc) #for cut2 (not used yet in this chunk)
# Back to the "Title" feature engineering.
# First validate the hypothesis "Master. is a proxy for young male":
boys = data.combined[which(data.combined$Title == "Master."), ]
mean_Age_Boys = mean(boys$Age, na.rm = TRUE) #the average age is indeed very low
summary(boys$Age)
# "Miss." is more complicated: it does not map to one clear age range.
miss = data.combined[which(data.combined$Title == "Miss."), ]
summary(miss$Age)
ggplot(miss, aes(x=Age, fill=Survived)) +
geom_histogram(binwidth = 1) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
# Same histogram restricted to labeled rows (Survived != "None").
ggplot(miss[which(miss$Survived != "None"),], aes(x=Age, fill=Survived)) +
geom_histogram(binwidth = 1) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
miss2 = miss[which(miss$Survived != "None"),]
summary(miss$Age)
summary(miss2$Age)
####################################################################################
# TODO (future feature engineering): split "Miss." into child vs. adult —
# survival probability for young Misses seems to differ from older ones.
# Check companion variables (SibSp, Parch) and, more generally, how the number
# of children/parents aboard influences survival for everyone.
####################################################################################
# Explore that point for future feature engineering.
miss_alone = miss[which(miss$SibSp == "0" & miss$Parch == "0"),]
# NOTE(review): "not alone" defined with & requires BOTH SibSp and Parch to be
# nonzero; the logical complement of miss_alone would use | instead — confirm
# which was intended.
miss_not_alone = miss[which(miss$SibSp != "0" & miss$Parch != "0"),]
summary(miss_alone)
summary(miss_not_alone)
# Noteworthy: Misses travelling with family died more often.
tableMA = table(miss_alone$Survived)
tableMNA = table(miss_not_alone$Survived)
ggplot(miss_not_alone[which(miss_not_alone$Survived != "None"), ], aes(x=Age, fill=Survived))+
geom_histogram(binwidth = 1) +
ggtitle("Miss. with parents or children")
summary(miss_not_alone$Age)
ggplot(miss_alone[which(miss_alone$Survived != "None"), ], aes(x=Age, fill=Survived))+
geom_histogram(binwidth = 1) +
ggtitle("Miss. alone")
summary(miss_alone$Age)
# Misses with family died more -> check the class distribution of those Misses.
ggplot(miss_not_alone[which(miss_not_alone$Survived != "None"), ], aes(x=Pclass, fill=Survived))+
geom_bar(width = 1) +
ggtitle("Miss. with parents or children")
# Fix vs. original: this plot filtered miss_alone with the miss_not_alone
# condition (wrong data frame inside which()), producing a mis-aligned subset.
ggplot(miss_alone[which(miss_alone$Survived != "None"), ], aes(x=Pclass, fill=Survived))+
geom_bar(width = 1) +
ggtitle("Miss. alone")
# Among Misses, rich ones with children did not die -> third-class Misses seem
# to have faced a choice between themselves and their children.
# Find the children whose mother died, and check whether they died too.
# First collect all children (cutoff chosen: Age <= 14.5).
children = data.combined[which(data.combined$Age <= 14.5),]
# How many children died, per class?
ggplot(children[which(children$Survived != "None"),], aes(x=Pclass, fill=Survived)) + geom_bar(width = 0.8)
# Many third-class children died. To find relatives: passengers sharing the
# same ticket number travelled together.
data.combined[which(data.combined$Ticket == "349909"),] #example: one of the children who died
children_died = children[which(children$Survived == "0"),]
ticket_children_died = children_died$Ticket
miss_with_children_died = miss[which(miss$Ticket %in% ticket_children_died),]
ggplot(miss_with_children_died[which(miss_with_children_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
ggtitle("Misses with their children dead")
# Misses whose child died mostly died as well (presumably they tried to save
# the child first).
children_not_died = children[which(children$Survived == "1"),]
ticket_children_not_died = children_not_died$Ticket
miss_with_children_not_died = miss[which(miss$Ticket %in% ticket_children_not_died),]
ggplot(miss_with_children_not_died[which(miss_with_children_not_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
ggtitle("Misses with their children not dead")
# Apparently every Miss whose child survived also survived (they stayed together).
# Expectation from earlier plots: most women whose child died were third class...
ggplot(miss_with_children_died[which(miss_with_children_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") +
ggtitle("Dristibution of Misses with their children dead across Pclass")
# ...and so it is. A Miss alone therefore had a higher chance of survival:
# third-class Misses with children could often save neither the children nor
# themselves.
# Using 14.5 (the Master. age cutoff) on miss_alone:
length(which(miss_alone$Age <= 14.5)) #only 4 of ~150 are 14.5 or younger
# SibSp variable (number of siblings/spouses aboard).
summary(data.combined$SibSp)
# Few unique values, so it can be treated as a factor for nicer visualization.
length(unique(data.combined$SibSp))
data.combined$SibSp = as.factor(data.combined$SibSp)
####################################################################################
# TODO: with many unique values, bin into intervals first and then factorise
# (extra: give each factor interval a readable name).
####################################################################################
# Plot SibSp vs survival on the labeled rows, faceted by Pclass and Title.
# Fix vs. original: the "+" after ggtitle() was missing, so the
# xlab/ylab/labs/ylim lines formed a separate, broken expression and never
# reached the plot; labs("Survived") was also unnamed — the fill legend needs
# labs(fill = ...).
ggplot(data.combined[1:891,], aes(x= SibSp, fill = Survived)) +
  geom_bar(width = 0.8) +
  facet_wrap(~Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Number of Sib/Sp") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  ylim(0,300)
# "Master." is intriguing but the counts are hard to read -> zoom in on it.
master_df = data.combined[which(data.combined$Title == "Master." & data.combined$Survived != "None"),]
ggplot(master_df, aes(x=SibSp, fill=Survived)) + geom_bar(width = 0.8)
# Boys with more than 2 siblings/spouses mostly died -> large families died.
ggplot(master_df, aes(x=SibSp, fill=Survived)) + geom_bar(width = 0.8) + facet_wrap("Pclass")
# That holds regardless of class (though nearly all the large families with
# many children are third class).
ggplot(data.combined[1:891,], aes(x= SibSp, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Number of Sib/Sp") +
ylab("Total Count") +
labs("Survived")
# Deaths only, by class, faceted by SibSp.
only_dead = data.combined[which(data.combined$Survived == "0"),]
ggplot(only_dead, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~SibSp) +
xlab("Pclass") +
ylab("TotalCount")
# Deaths among passengers travelling with both siblings/spouse AND parents/children.
only_dead_not_alone = data.combined[which(data.combined$Survived == "0" & data.combined$SibSp != "0" & data.combined$Parch != 0),]
ggplot(only_dead_not_alone, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~SibSp) +
xlab("Pclass") +
ylab("TotalCount")
# Parch variable (parents/children aboard): same style of analysis as SibSp.
summary(data.combined$Parch)
# Few unique values -> factorise for plotting.
length(unique(data.combined$Parch))
data.combined$Parch = as.factor(data.combined$Parch)
# Plot Parch vs survival on the labeled rows, faceted by Pclass and Title.
# Fix vs. original: missing "+" after ggtitle() detached the
# xlab/ylab/labs/ylim lines from the plot (and labs("Survived") needed the
# fill argument named).
ggplot(data.combined[1:891,], aes(x= Parch, fill = Survived)) +
  geom_bar(width = 0.8) +
  facet_wrap(~Pclass + Title) +
  ggtitle("Pclass, Title") +
  xlab("Number of Parents/Children") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  ylim(0,300)
# Parch vs survival, faceted by class only.
ggplot(data.combined[1:891,], aes(x= Parch, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Number of Parch") +
ylab("Total Count") +
labs("Survived")
# Deaths only, by class, faceted by Parch.
only_dead = data.combined[which(data.combined$Survived == "0"),]
ggplot(only_dead, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~Parch) +
xlab("Pclass") +
ylab("TotalCount")
# Deaths among passengers travelling with both siblings/spouse AND parents/children.
only_dead_not_alone = data.combined[which(data.combined$Survived == "0" & data.combined$SibSp != "0" & data.combined$Parch != 0),]
ggplot(only_dead_not_alone, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~Parch) +
xlab("Pclass") +
ylab("TotalCount")
# Feature engineering: family size.
# Rebuild from the raw train/test vectors because SibSp/Parch inside
# data.combined were converted to factors above.
temp.sibsp = c(train$SibSp, test$SibSp)
temp.Parch = c(train$Parch, test$Parch)
# Family size = parents/children + siblings/spouse + 1 (the passenger).
data.combined$Family_Size = as.factor(temp.Parch+temp.sibsp +1)
ggplot(data.combined[1:891,], aes(x=Family_Size, fill=Survived)) +
geom_bar(width = 0.8)
# Survival rates drop sharply for families with more than 4 members ->
# split family size into two groups: Large (>4) and Small (<=4).
# Bin Family_Size into "Small Family" (<= 4) and "Large Family" (> 4).
# Fixes vs. the original row-by-row loop:
#  1) as.numeric() on a factor returns level CODES, not the underlying values;
#     converting through as.character() recovers the real family sizes.
#  2) assigning "Small Family" into a numeric column silently coerced the
#     whole column to character, so later iterations compared strings
#     lexicographically (e.g. "10" <= "4" is TRUE), misclassifying rows.
# A vectorised ifelse() on the true numeric values avoids both problems.
fam_size_num = as.numeric(as.character(data.combined$Family_Size))
data.combined$New_Family_Size = as.factor(
  ifelse(fam_size_num <= 4, "Small Family", "Large Family")
)
ggplot(data.combined[1:891, ], aes(x=New_Family_Size, fill = Survived)) + geom_bar(width = 0.8)
table(data.combined[1:891, "New_Family_Size"], data.combined[1:891, "Survived"])
# Cabin variable.
ggplot(data.combined[1:891, ], aes(x=Cabin, fill=Survived)) + geom_bar(width = 0.8)
length(unique(data.combined$Cabin))
# Reduce Cabin to its deck letter (first character).
data.combined$New_Cabin = as.character(data.combined$Cabin)
data.combined$New_Cabin = str_sub(data.combined$New_Cabin,1,1)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Cabin != ""),], aes(x=New_Cabin, fill=Survived)) +
geom_bar(width = 0.8) #cabin looks uninformative here...
# ...but only because the blank cabins (mostly third-class passengers) were
# dropped. Redo it keeping blanks, recoded as "O".
data.combined$New_Cabin2 = as.character(substr(data.combined$Cabin,1,1))
data.combined[which(data.combined$New_Cabin2 == ""), "New_Cabin2"] = "O"
ggplot(data.combined[1:891,], aes(x=New_Cabin2, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass")
# Pclass is a good proxy for cabin; keeping Cabin itself risks overfitting.
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Cabin != ""),], aes(x=New_Cabin, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") #cabin is known almost only for first-class passengers
table(data.combined$Pclass, data.combined$New_Cabin) #confirms why survival looked so high in the earlier plot
table(data.combined$Survived, data.combined$New_Cabin)
table(data.combined$Sex, data.combined$New_Cabin)
# Deck A is the only one that looks bad for survival, but it also has a much
# higher share of males. Suggestion: remove this variable.
# Before moving on, check passengers holding multiple cabins.
data.combined$Cabin = as.character(data.combined$Cabin)
# NOTE(review): missing cabins appear as empty strings "" in this data, not
# NA, so this is.na() replacement likely matches nothing — verify.
data.combined[which(is.na(data.combined$Cabin)), "Cabin"] = "Other"
# Multiple cabins are space-separated within one string.
multiple_cabin = ifelse(str_detect(data.combined$Cabin, " "), "Multiple", "Alone")
data.combined$multiple.cabin = as.factor(multiple_cabin)
prop.table(table(data.combined$multiple.cabin, data.combined$Survived))
ggplot(data.combined[1:891,], aes(x=multiple.cabin, fill=Survived)) + geom_bar(width = 0.8)
ggplot(data.combined[1:891,], aes(x=multiple.cabin, fill=Survived)) + geom_bar(width = 0.8) + facet_wrap("Pclass")
# Embarked variable.
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$Embarked != ""),], aes(x=Embarked, fill=Survived)) +
geom_bar(width = 0.8) #more deaths for S and Q than C, but the port itself should not matter
# More plausibly, class composition differs by embarkation port (the survival
# correlation is really driven by class).
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$Title != "Mr."),], aes(x=Embarked, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass")
# Suggestion: remove this variable too.
# Fare variable: Pclass is already a good proxy for it, but check the
# distribution nonetheless.
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=Fare, fill=Survived)) + geom_histogram(binwidth = 10)
table(data.combined$Pclass, data.combined$Fare) #too granular; needs aggregation
# Average fare per class, and per class/sex (also used below to impute the
# missing fare).
mean_1 = mean(data.combined[which(data.combined$Pclass == "1"), "Fare"], na.rm = TRUE)
mean_1.1 = mean(data.combined[which(data.combined$Pclass == "1" & data.combined$Sex == "male"), "Fare"], na.rm = TRUE)
mean_1.2 = mean(data.combined[which(data.combined$Pclass == "1" & data.combined$Sex == "female"), "Fare"], na.rm = TRUE)
mean_2 = mean(data.combined[which(data.combined$Pclass == "2"), "Fare"], na.rm = TRUE)
mean_2.1 = mean(data.combined[which(data.combined$Pclass == "2" & data.combined$Sex == "male"), "Fare"], na.rm = TRUE)
mean_2.2 = mean(data.combined[which(data.combined$Pclass == "2" & data.combined$Sex == "female"), "Fare"], na.rm = TRUE)
mean_3 = mean(data.combined[which(data.combined$Pclass == "3"), "Fare"], na.rm = TRUE)
mean_3.1 = mean(data.combined[which(data.combined$Pclass == "3" & data.combined$Sex == "male"), "Fare"], na.rm = TRUE)
mean_3.2 = mean(data.combined[which(data.combined$Pclass == "3" & data.combined$Sex == "female"), "Fare"], na.rm = TRUE)
# Plan: split Fare into two groups, <= 50 and > 50 (coarser than Pclass).
# TODO: men in 2nd and 3rd class died at similar rates while women did not ->
#       compare fares of the surviving 2nd-class women.
# TODO: decide how to impute the many missing Age values.
# TODO: study how to balance the male/female imbalance before modelling.
# TODO: some fares look inconsistent with the class (moot if Fare is dropped).
# Impute the missing Fare value, then (next block) bin Fare into two groups.
# Fix vs. original: the row-by-row loop overwrote `to_be_changed` on every NA
# hit, keeping only the LAST matching row; a vectorised which(is.na(...))
# captures all of them at once and is O(n) simpler.
to_be_changed = data.combined[which(is.na(data.combined$Fare)), ]
to_be_changed2 = data.combined[which(is.na(data.combined$Fare)), ]
# The passenger with the missing fare is a third-class male, so impute with
# the third-class male mean.
data.combined[which(is.na(data.combined$Fare)), "Fare"] = mean_3.1
# With the NA replaced by its group mean, the new binned variable can be built.
# Bin Fare into "Poor" (<= 50) and "Rich" (> 50).
# Fix vs. original: the loop assigned the strings "Poor"/"Rich" into an
# integer column; the first assignment silently coerced the whole column to
# character, so later iterations compared strings lexicographically
# (e.g. "7" <= "50" is FALSE), misclassifying fares. A vectorised ifelse()
# on the truncated numeric fares fixes that.
data.combined$New_Fare = as.factor(
  ifelse(as.integer(data.combined$Fare) <= 50, "Poor", "Rich")
)
# New_Fare vs survival, overall and by sex.
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=New_Fare, fill = Survived)) +
geom_bar(width = 0.8)
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=New_Fare, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Sex")
# Everything these plots show, Pclass shows in greater detail ->
# this variable can also be removed in favour of Pclass.
# Ticket variable -> the most cryptic one.
length(unique(data.combined$Ticket))
head(data.combined$Ticket)
data.combined[1:10,]
# Rough observations to be verified:
# 1) rows with the same Ticket share the same Fare;
# 2) the leading digit often matches Pclass (possibly a boarding-precedence
#    effect), both for all-numeric and for letter-prefixed tickets;
# 3) some codes carry letter prefixes that look like place names (e.g. Paris)
#    -> candidate feature, given Titanic's history.
# Isolate the letter prefix of the ticket by stripping the numeric tail.
# Approach: reverse the string, split on the FIRST space of the reversed form
# (i.e. the LAST space of the original), keep the remainder, reverse back.
data.combined$New_Ticket = as.character(data.combined$Ticket)
data.combined$New_Ticket2 = as.character(data.combined$Ticket)
library(tcR)
data.combined$New_Ticket = reverse.string(data.combined$New_Ticket)
data.combined$New_Ticket2 = reverse.string(data.combined$New_Ticket2)
# Split each reversed ticket into (number, rest) across columns 17 and 18.
# NOTE(review): hard-coded column indices 17/18 are fragile — they silently
# break if the column layout changes.
for (i in c(1:nrow(data.combined))){
data.combined[i, c(17,18)] = str_split_fixed(data.combined[i, "New_Ticket"], " ", 2)
}
data.combined$New_Ticket = NULL
colnames(data.combined)[17] = "New_Ticket"
data.combined$New_Ticket = as.character(data.combined$New_Ticket)
# NOTE(review): the next two lines both reverse the column; if both actually
# run, non-empty entries end up reversed twice (i.e. reversed again). The
# author's note suggests the unguarded call failed on empty strings
# interactively, so likely only the guarded version should be kept — verify.
data.combined$New_Ticket = reverse.string(data.combined$New_Ticket) #maybe it doesn't work witg empty string
data.combined[which(data.combined$New_Ticket != ""), "New_Ticket"] = reverse.string(data.combined[which(data.combined$New_Ticket != ""), "New_Ticket"])
length(unique(data.combined$New_Ticket))
ggplot(data.combined[which(data.combined$New_Ticket != "" & data.combined$Survived != "None"),], aes(x=New_Ticket, fill=Survived)) +
geom_bar(width = 0.4)
# Too many unique values -> no conclusion possible from this view.
# Map a raw ticket string to a coarse prefix category
# (e.g. "SC/PARIS 2133" -> "PARIS").
#
# @param New_Ticket2 a single ticket code (coerced to character)
# @return one of "PARIS", "C.A.", "PC", "SOTON", "STON", "W./C.", "A/S", "Other."
#
# Order matters: "SOTON" is tested before "STON" (every SOTON ticket also
# contains STON), and upper-case "PARIS" before mixed-case "Paris".
# Fix vs. original: grep() treated the patterns as regexes, where "." matches
# ANY character (so "C.A." could also match unrelated strings); fixed = TRUE
# makes the dots literal.
extractTicket = function(New_Ticket2) {
  New_Ticket2 = as.character(New_Ticket2)
  if (any(grepl("PARIS", New_Ticket2, fixed = TRUE))) {
    return("PARIS")
  } else if (any(grepl("C.A.", New_Ticket2, fixed = TRUE))) {
    return("C.A.")
  } else if (any(grepl("PC", New_Ticket2, fixed = TRUE))) {
    return("PC")
  } else if (any(grepl("Paris", New_Ticket2, fixed = TRUE))) {
    return("PARIS")
  } else if (any(grepl("SOTON", New_Ticket2, fixed = TRUE))) {
    return("SOTON")
  } else if (any(grepl("STON", New_Ticket2, fixed = TRUE))) {
    return("STON")
  } else if (any(grepl("W./C.", New_Ticket2, fixed = TRUE))) {
    return("W./C.")
  } else if (any(grepl("A/S", New_Ticket2, fixed = TRUE))) {
    return("A/S")
  } else {
    return("Other.")
  }
}
# Apply extractTicket() to every ticket and store the coarse prefix as a factor.
# Fix vs. original: growing a vector with c() inside a for-loop copies it on
# every append (quadratic); vapply() does one pass and enforces the result type.
data.combined$New_Ticket2 = as.factor(
  vapply(as.character(data.combined$Ticket), extractTicket, character(1), USE.NAMES = FALSE)
)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Ticket2 != "Other."),], aes(x=New_Ticket2, fill=Survived)) +
geom_bar(width = 0.8)
# SOTON and W./C. show very low survival, but: 1) their meaning is unknown,
# 2) their counts are too small to be statistically meaningful.
####################################################################################
# SUMMING UP:
# 1) Age has too many missing values - consider imputing it
# 2) Sex is unbalanced -> ideally the dataset should have been balanced first
# 3) Remove Name; use Title as a proxy for Age and Sex (which can then be
#    removed) -> note this does not fix the imbalance
# 4) Cabin, Fare, Ticket and Embarked will be removed
# 5) Parch and SibSp could be kept, to explore further via family size
# 6) All other derived columns were exploratory only and should be removed
####################################################################################
####################################################################################
#
# EXPLANATORY MODEL
#
####################################################################################
library(randomForest)
# Instead of throwing every variable in at once, start with two (Pclass and
# Title) expected to have the greatest impact.
rf.train.1 = data.combined[1:891, c("Title", "Pclass")] #predictor frame (x)
# Response vector (y: Survived or not).
# Fix vs. original: this was assigned to `rf.labe.l` (typo), so the
# randomForest() call below failed with "object 'rf.label' not found".
rf.label = as.factor(train$Survived)
# Random forests are stochastic -> set.seed() makes the run reproducible.
set.seed(1234)
# importance = TRUE -> track per-feature importance while building the trees.
# ntree = 1000 -> number of trees in the forest (default is 500).
rf.1 = randomForest(x=rf.train.1, y=rf.label, importance = TRUE, ntree = 1000)
rf.1
varImpPlot(rf.1)
#OOB error: 20.76
# Add a third feature at a time and compare OOB error rates (noted after each fit).
rf.train.2 = data.combined[1:891, c("Title", "Pclass", "Parch")]
rf.label.2 = as.factor(train$Survived)
set.seed(1234)
rf.2 = randomForest(rf.train.2, rf.label.2, importance = TRUE, ntree = 1000)
rf.2
varImpPlot(rf.2)
#OOB error: 19.42
rf.train.3 = data.combined[1:891, c("Title", "Pclass", "SibSp")]
rf.label.3 = as.factor(train$Survived)
set.seed(1234)
rf.3 = randomForest(rf.train.3, rf.label.3, importance = TRUE, ntree = 1000)
rf.3
varImpPlot(rf.3)
#OOB error: 19.3
# Try the engineered family-size features.
data.combined$Family_Size = as.factor(data.combined$Family_Size)
rf.train.4 = data.combined[1:891, c("Title", "Pclass", "Family_Size")]
rf.label.4 = as.factor(train$Survived)
set.seed(1234)
rf.4 = randomForest(rf.train.4, rf.label.4, importance = TRUE, ntree = 1000)
rf.4
varImpPlot(rf.4)
#OOB error: 17.73
rf.train.5 = data.combined[1:891, c("Title", "Pclass", "New_Family_Size")]
rf.label.5 = as.factor(train$Survived)
set.seed(1234)
rf.5 = randomForest(rf.train.5, rf.label.5, importance = TRUE, ntree = 1000)
rf.5
varImpPlot(rf.5)
#OOB error: 17.51
# Swap Pclass for Fare.
rf.train.6 = data.combined[1:891, c("Title", "Fare", "New_Family_Size")]
rf.label.6 = as.factor(train$Survived)
set.seed(1234)
rf.6 = randomForest(rf.train.6, rf.label.6, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)
#OOB error: 17.62
# Retry with Fare truncated to integer.
data.combined$Fare = as.integer(data.combined$Fare)
rf.train.6 = data.combined[1:891, c("Title", "Fare", "New_Family_Size")]
rf.label.6 = as.factor(train$Survived)
set.seed(1234)
rf.6 = randomForest(rf.train.6, rf.label.6, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)
#OOB error: 17.51
rf.train.7 = data.combined[1:891, c("Title", "Fare", "Family_Size")]
rf.label.7 = as.factor(train$Survived)
set.seed(1234)
rf.7 = randomForest(rf.train.7, rf.label.7, importance = TRUE, ntree = 1000)
rf.7
varImpPlot(rf.7)
#OOB error: 17.4 -> Fare beats Pclass here (but carries overfitting risk)
rf.train.8 = data.combined[1:891, c("Title", "New_Fare", "New_Family_Size")]
rf.label.8 = as.factor(train$Survived)
set.seed(1234)
rf.8 = randomForest(rf.train.8, rf.label.8, importance = TRUE, ntree = 1000)
rf.8
varImpPlot(rf.8)
#OOB error: 17.73
# Experimental features: share of family size due to siblings/spouse (New_FS1)
# and to parents/children (New_FS2).
# Fix vs. original: as.numeric() on a factor yields level CODES, not the
# underlying values (e.g. factor level "0" becomes 1), so the ratios were
# computed on shifted numbers; converting through as.character() recovers the
# true counts.
data.combined$Family_Size = as.numeric(as.character(data.combined$Family_Size))
data.combined$SibSp = as.numeric(as.character(data.combined$SibSp))
data.combined$Parch = as.numeric(as.character(data.combined$Parch))
data.combined$New_FS1 = data.combined$SibSp / data.combined$Family_Size
data.combined$New_FS2 = data.combined$Parch / data.combined$Family_Size
# Fit models with the ratio features.
rf.train.9 = data.combined[1:891, c("Title", "Fare", "New_FS1")]
rf.label.9 = as.factor(train$Survived)
set.seed(1234)
rf.9 = randomForest(rf.train.9, rf.label.9, importance = TRUE, ntree = 1000)
rf.9
varImpPlot(rf.9)
#OOB error: 18.52
rf.train.10 = data.combined[1:891, c("Title", "Fare", "New_FS2")]
rf.label.10 = as.factor(train$Survived)
set.seed(1234)
rf.10 = randomForest(rf.train.10, rf.label.10, importance = TRUE, ntree = 1000)
rf.10
varImpPlot(rf.10)
#OOB error: 18.29
rf.train.11 = data.combined[1:891, c("Title", "Fare", "New_FS1", "New_FS2")]
rf.label.11 = as.factor(train$Survived)
set.seed(1234)
rf.11 = randomForest(rf.train.11, rf.label.11, importance = TRUE, ntree = 1000)
rf.11
varImpPlot(rf.11)
#OOB error: 18.41 -> the ratio features do not beat rf.7 (17.4)
####################################################################################
# SUBMISSION
####################################################################################
# Build the feature frame for the unlabeled (test) passengers.
test.submit.df <- data.combined[which(data.combined$Survived == "None"),
                                c("Title", "Fare", "Family_Size")]
# Score them with the best model so far (rf.7).
rf.7.preds <- predict(rf.7, test.submit.df)
table(rf.7.preds)
# Kaggle expects passenger id + prediction; test ids run 892..1309.
passenger.ids <- 892:nrow(data.combined)
submit.df <- data.frame(PassengerID = passenger.ids, Survived = rf.7.preds)
# Write the upload file.
levels(submit.df$Survived) <- c("0", "1")
write.csv(submit.df, "RF_20190114_1.csv", row.names = FALSE)
# Kaggle score: 0.78947
# A second submission, using the rf.5 feature set.
test.submit.df2 <- data.combined[which(data.combined$Survived == "None"),
                                 c("Title", "Pclass", "New_Family_Size")]
rf.5.preds <- predict(rf.5, test.submit.df2)
table(rf.5.preds)
submit.df2 <- data.frame(PassengerID = passenger.ids, Survived = rf.5.preds)
levels(submit.df2$Survived) <- c("0", "1")
write.csv(submit.df2, "RF_20190114_2.csv", row.names = FALSE)
# Kaggle score: 0.78947
# The leaderboard results are slightly below the OOB estimates. Rather than
# tuning against repeated submissions (overfitting risk), use CROSS VALIDATION
# to obtain a sound accuracy estimate before submitting.
####################################################################################
# CROSS VALIDATION
#it is used for:
#(1) After having chosen the model type (random forest), I want to see if a feature or feature engineering really improved the model
#(2) If I change model type, does it work better or not with respect to random forest?
####################################################################################
library(caret) #it can do thousands of things, cross validation included
library(doSNOW) #it enables parallel computing when doing cross validation
#we do 10-fold cross-validation with stratified sampling (it means that the ratio of survivors remains unchanged)
set.seed(2348)
cv.10.folds = createMultiFolds(rf.label.4, k=10, times=10)
#we have created folds, we have to set up that caret will use them for cross validation
control1 = trainControl(method = "repeatedcv", number = 10, repeats = 10, index = cv.10.folds)
#set up doSNOW for multi-core training -> parallel computing on CPU worker processes
c1 = makeCluster(6, type = "SOCK") #6 is how many subprocesses the computer will manage in parallel
registerDoSNOW(c1) #caret understands that you want to work in parallel by registering
#training function for caret
library(e1071)
set.seed(34324)
rf.5.cv.1 = train(x = rf.train.4, y = rf.label.4, method = "rf", tuneLength = 2, ntree=1000, trControl = control1)
#note: method says "use the random forest, the same as before", trControl says "how you are going to do training: repeated validation with 10 cv..."
#tuneLength -> hyperparameter tuning budget: try that many candidate values of the model's
#tuning parameter (mtry for random forest — NOT the number of trees) and keep the best
#shutdown the cluster previously registered
stopCluster(c1)
#check the results
rf.5.cv.1
#the above results are closer to the OOB results rather than submission results (too optimistic)
#we use fewer folds to see if, with less data per model, it stops being over-optimistic
set.seed(5983)
cv.3.folds = createMultiFolds(y = rf.label.4, k=3, times = 10)
control2 = trainControl(method = "repeatedcv", number = 3, repeats = 10, index = cv.3.folds)
c2 = makeCluster(6, type = "SOCK")
registerDoSNOW(c2)
set.seed(89472)
rf.5.cv.2 = train(x=rf.train.4, y=rf.label.4, method = "rf", tuneLength = 2, ntree = 1000, trControl = control2)
stopCluster(c2)
rf.5.cv.2
# Cross-validate a single rpart decision tree with caret, training in parallel.
#
# Args:
#   seed     - integer seed, set immediately before training for reproducibility
#   training - data.frame of predictor columns
#   labels   - factor of outcomes, one per row of `training`
#   control  - caret::trainControl object describing the CV scheme
#   cores    - number of parallel SOCK workers (default 6, the original hard-coded value)
#
# Returns the fitted caret::train object (tuneLength = 30 complexity candidates).
rpart.cv = function(seed, training, labels, control, cores = 6){
  library(rpart)
  library(rpart.plot)
  library(doSNOW)
  library(caret)
  cl = makeCluster(cores, type = "SOCK")
  registerDoSNOW(cl)
  # FIX: guarantee the cluster is shut down even if train() errors
  # (the original leaked the workers on failure)
  on.exit(stopCluster(cl), add = TRUE)
  set.seed(seed)
  rpart.cv = train(x = training, y = labels, method = "rpart", tuneLength = 30, trControl = control)
  return(rpart.cv)
}
features = c("Title", "Pclass", "Family_Size")
rpart.train.1 = data.combined[1:891, features]
#let's run the function created above
rpart.1.cv.1 = rpart.cv(94622, rpart.train.1, rf.label.4, control1)
rpart.1.cv.1
#Plot
prp(rpart.1.cv.1$finalModel, extra = 1, under = TRUE)
#Note: suggested to put "rpart.1.cv.1$finalModel" in the Console
# Interesting things noted that could be subject to improvements:
# 1) If you are a Mr. or Other you are classified automatically as Not Survived, creating obvious inaccuracy (for example we know that males in first class survived more than males in third)
# 2) All other titles go to the next test: you are in first and second class or third class (this time it is accurate -> no need for improvements)
# 3) Last test: if your family size is more than 5, you die, otherwise you survive with 100% accuracy -> very specific, overfitting
# - the test set can have a family of more than 5 in third class in which they survive for any reason
#deeper look at title (it's strange to have Mr. and Other together -> maybe Other groups different types of folk that can be better classified)
table(data.combined$Title)
data.title = data.combined[which(data.combined$Title == "Other."), c("Survived", "Pclass", "Sex", "Name")]
#in fact there are different titles that we cannot overlook
#let's split the name and extract all titles
data.combined[1:10, "Name"] #a normal name is given by Name-comma-Title-space-Name-space-otherNames
library(stringr)
name.splits = str_split(data.combined$Name, ",")
last.names = sapply(name.splits, "[", 1)
#alternatively: last.names2 = sapply(strsplit(data.combined$Name, ","), head, n = 1) or with tail to have the last value
secondpart.names = sapply(name.splits, "[", 2)
name.splits = str_split(secondpart.names, " ")
titles = sapply(name.splits, "[", 2)
unique(titles)
#we can see that we had a lot of titles that should be classified better (e.g. Lady and Dona are women and, being classified as Other., they were predicted to always die)
#it's strange the "the" Title
#data.combined[which(titles == "the"),]
#initially I did this (I create a new column equal to variable titles and then I look inside the dataset):
#data.combined$New_Title = titles
#data.combined[which(data.combined$New_Title == "the"),]
#it's interesting that I can check inside the dataset a specific value of the variable titles even if it's not yet included formally as a new column of the dataset
#anyway "the" was for "the Countess." -> it can be changed to "Mrs."
#data.combined[which(data.combined$New_Title == "the"), "New_Title"] <- "Mrs."
#So, I can do all corrections like that, but I want to learn how to modify the variable titles and then create New_Title and make it equal to titles
#data.combined$New_Title = NULL
data.combined[which(titles == "the"),]
titles[titles == "the"] = "Lady."
#titles[titles == "Dr." | titles == "Dona."]
#or alternatively
#titles[titles %in% c("Dona.", "Dr.")]
#okay, but I don't want Dona. and Dr. together -> it was just for learning
data.combined[which(titles == "Dona."),] #-> she is 39 with siblings and Parch -> I assign Mrs.
titles[titles == "Dona."] = "Lady."
data.combined[which(titles %in% c("Ms.", "Mlle.")),] #Mlle stands for Miss. (hence there was a misclassified item)
#Ms. is here undefined but there is no such difference between Miss and Mrs -> I assign Miss.
titles[titles %in% c("Ms.", "Mlle.")] <- "Miss."
titles[titles %in% c("Jonkheer.", "Don.")] <- "Sir."
titles[titles %in% c("Col.", "Capt.", "Major.")] <- "Officer"
titles[titles == "Mme."] <- "Mrs."
unique(titles)
table(titles)
data.combined$New_Title = titles
library(ggplot2)
ggplot(data.combined[1:891,], aes(x=New_Title, fill=Survived)) +
geom_bar(width = 0.8) +
xlab("New_Title") +
ylab("Total Count") +
ggtitle("Survival rate for new titles") +
facet_wrap("Pclass")
#collapse the rare male titles into Mr. and the rare female ones into Mrs.
data.combined[which(data.combined$New_Title %in% c("Dr.", "Officer", "Sir.", "Rev.")), "New_Title"] = "Mr."
data.combined[which(data.combined$New_Title == "Lady."), "New_Title"] = "Mrs."
table(data.combined$New_Title)
#let's quickly cross validate the new algorithm
features = c("New_Title", "Pclass", "Family_Size")
rpart.train.2 = data.combined[1:891, features]
rpart.2.cv.1 = rpart.cv(94622, rpart.train.2, rf.label.4, control1)
rpart.2.cv.1
prp(rpart.2.cv.1$finalModel, extra=1, under=TRUE)
rpart.2.cv.1$finalModel
#better accuracy but still: (1) if you are a male, you die and (2) the family_size test must be fixed
#Dive into 1st class "Mr."
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
summary(first.mr.df)
#there is a female which has a title of Mr. -> I update the title
data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1" & data.combined$Sex == "female"), "New_Title"] <- "Mrs."
#check if there are other wrong assignments
table(data.combined$Sex, data.combined$New_Title)
#let's see all the misclassified males in first class
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
summary(first.mr.df[which(first.mr.df$Survived == "1"),])
View(first.mr.df[which(first.mr.df$Survived == "1"),])
#from this table I see something strange in the ticket and fare (there are same fares for same ticket but they are aggregated)
#so, for example, there is no 512.3292 fare, because it's the sum of the fares of all the people with the same ticket (technically coming from the same family)
#and we understand how our variable Family_Size brings some misleading information (someone that has a ticket in common with other people was assigned a unitary family size)
#task: create two columns: one that, for each row, counts how many people have the same ticket, and one that gives the average fare
# Vectorized replacement for the original O(n^2) row-by-row loop.
# Number_People: size of each ticket group (how many passengers share the ticket),
# computed once per group via ave() instead of re-scanning the whole column per row.
data.combined$Number_People = ave(rep(1, nrow(data.combined)), data.combined$Ticket, FUN = sum)
# Avg_Fare: the listed Fare is the ticket-group total, so dividing by the group
# size recovers a per-person fare. as.integer() truncates, matching the
# original code's behaviour exactly (and keeps NA where Fare is NA).
data.combined$Avg_Fare = as.integer(data.combined$Fare / data.combined$Number_People)
#Now let's compare the distribution of survival rate for Mr. 1st class with Fare and Avg_Fare
#BUG FIX: first.mr.df was built BEFORE Number_People/Avg_Fare existed, so the
#Avg_Fare / Number_People plots below referenced a column it does not have;
#rebuild it so the new columns are included
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
ggplot(first.mr.df, aes(x=Fare, fill=Survived)) +
geom_density(alpha = 0.5) +
ggtitle("1st Mr distribution by Fare")
ggplot(first.mr.df, aes(x=Avg_Fare, fill=Survived)) +
geom_density(alpha = 0.5) +
#FIX: the original reused the "by Fare" title on the Avg_Fare plot
ggtitle("1st Mr distribution by Avg_Fare")
#With the feature engineering on fare we have fixed the problem that in the test set there were a lot of fares without a reference in the training
ggplot(first.mr.df[which(first.mr.df$Survived != "None"),], aes(x=Avg_Fare, fill=Survived)) + geom_density(alpha = 0.5)
ggplot(first.mr.df[which(first.mr.df$Survived != "None"),], aes(x=Number_People, fill=Survived)) + geom_density(alpha = 0.5)
#regarding the number of people travelling with, always concerning 1st Mr, we have a higher density for >4, but in that case the survival rate is close to 0
#these two features show some interesting patterns which could help in forecasting the survival rate, but might be correlated
summary(data.combined$Avg_Fare)
summary(data.combined$Number_People)
#even if not technically required here, it's good practice to normalize data (using caret's preProcess function)
data.combined.to.normalise = data.combined[, c("Number_People", "Avg_Fare")]
preProc = preProcess(data.combined.to.normalise, method = c("center", "scale"))
data.combined.normalised = predict(preProc, data.combined.to.normalise)
#let's see if they are correlated
cor(data.combined.normalised$Number_People, data.combined.normalised$Avg_Fare) #which is very low
#same value before normalising: centering/scaling never changes Pearson correlation
cor(data.combined$Number_People, data.combined$Avg_Fare)
data.combined$Family_Size = as.numeric(data.combined$Family_Size)
#let's cross validate the new model
features = c("Pclass", "New_Title", "Number_People", "Avg_Fare")
rpart.train.3 = data.combined[1:891, features]
rpart.3.cv.1 = rpart.cv(94622, rpart.train.3, rf.label.4, control1)
rpart.3.cv.1
prp(rpart.3.cv.1$finalModel, extra=1, under=TRUE)
rpart.3.cv.1$finalModel
#New Submission
test.submit.df3 = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rpart.preds = predict(rpart.3.cv.1$finalModel, test.submit.df3, type = "class")
table(rpart.preds)
submit.df3 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190115_2.csv", row.names = FALSE)
#the result of accuracy is more than 0.80 and top 10% in the leaderboard
#to have the probability of surviving or not instead of the class
test.submit.df4 = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
#BUG FIX: the original predicted on the older test.submit.df3; predict on the
#data frame built on the line above
rpart.preds4 = predict(rpart.3.cv.1$finalModel, test.submit.df4, type = "prob")
table(rpart.preds4)
submit.df4 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rpart.preds4)
write.csv(submit.df4, "Probability_of_Surviving.csv", row.names = FALSE)
#Let's do now the corresponding RandomForest using the new features and submit
library(randomForest)
features = c("Pclass", "New_Title", "Number_People", "Avg_Fare")
rf.train.temp = data.combined[1:891, features]
#training labels: the Survived outcomes of the original training rows
rf.label = as.factor(data.combined[which(data.combined$Survived != "None"), "Survived"])
set.seed(1234)
#BUG FIX: the original passed rf.label.10, which is never created in this file;
#use the rf.label built just above
rf.temp = randomForest(x=rf.train.temp, y=rf.label, ntree = 1000)
rf.temp
varImpPlot(rf.temp)
data.combined$New_Title = as.factor(data.combined$New_Title)
rf.train.t = data.combined[1:891, c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rf.label.t = as.factor(train$Survived)
set.seed(1234)
rf.t = randomForest(rf.train.t, rf.label.t, importance = TRUE, ntree = 1000)
rf.t
varImpPlot(rf.t)
test.submit.df.t = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rf.preds.t = predict(rf.t, test.submit.df.t)
table(rf.preds.t)
submit.df.t = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rf.preds.t)
#BUG FIX: the original wrote submit.df2 (an older submission) to this file;
#write the random-forest predictions just computed
write.csv(submit.df.t, "RF_20190115_4.csv", row.names = FALSE)
####################################################################################
# MUTUAL INFORMATION - BASIC
library(infotheo)
#mutual information -> how much information about the survival outcome each single variable carries
#it's useful to see if our feature-engineering efforts are well repaid or we are losing information
#it's useful to validate if the features we have selected actually bring information (if it's likely to be predictive)
mutinformation(rf.label, data.combined$Pclass[1:891])
mutinformation(rf.label, data.combined$Parch[1:891])
mutinformation(rf.label, data.combined$SibSp[1:891])
mutinformation(rf.label, data.combined$Ticket[1:891])
mutinformation(rf.label, data.combined$Title[1:891])
mutinformation(rf.label, data.combined$New_Title[1:891])
#NOTE(review): New_Title2 is not created anywhere visible in this file — confirm it
#exists, otherwise this line passes NULL to mutinformation
mutinformation(rf.label, data.combined$New_Title2[1:891])
mutinformation(rf.label, data.combined$Fare[1:891])
mutinformation(rf.label, data.combined$Avg_Fare[1:891])
mutinformation(rf.label, data.combined$Family_Size[1:891])
mutinformation(rf.label, data.combined$New_Family_Size[1:891])
mutinformation(rf.label, data.combined$Sex[1:891])
mutinformation(rf.label, data.combined$Number_People[1:891])
#continuous variables are binned with discretize() before computing mutual information
data.combined$FFF = discretize(data.combined$Fare)
mutinformation(rf.label, discretize(data.combined$Fare[1:891]))
mutinformation(rf.label, discretize(data.combined$Avg_Fare[1:891]))
####################################################################################
# DIMENSION REDUCTION - BASIC
library(Rtsne)
#we try to see a 2d visualization of a 4-dimensional space
#we start from the lower part of the graph - dataset of any record which is not Mr.
#BUG FIX: the original `data.combined[which(...)]` was missing the row/column comma,
#so the which() indices selected COLUMNS instead of rows (erroring for any index
#beyond the column count); the comma makes it a row subset
most.correct = data.combined[which(data.combined$Title != "Mr."), ]
#to be finished
####################################################################################
# CONDITIONAL MUTUAL INFORMATION - BASIC
#to be done
####################################################################################
# MISSING VALUES - IMPUTATION OF NAs
#let's take Age, which is the variable with the highest number of NA values
#Note: with such a high number of NAs, it's better to find a proxy for age (like Title)
sum(is.na(data.combined$Age))
sum(!(is.na(data.combined$Age)))
#handmade imputation: one mean age per Title group
mean_title1 = mean(data.combined[which(data.combined$Title == "Mr."), "Age"], na.rm = TRUE)
mean_title2 = mean(data.combined[which(data.combined$Title == "Mrs."), "Age"], na.rm = TRUE)
mean_title3 = mean(data.combined[which(data.combined$Title == "Miss."), "Age"], na.rm = TRUE)
mean_title4 = mean(data.combined[which(data.combined$Title == "Master."), "Age"], na.rm = TRUE)
mean_title5 = mean(data.combined[which(data.combined$Title == "Other."), "Age"], na.rm = TRUE)
#Age2: copy of Age with each NA replaced by its Title-group mean
data.combined$Age2 = data.combined$Age
data.combined[which(data.combined$Title == "Mr." & is.na(data.combined$Age2)), "Age2"] = mean_title1
data.combined[which(data.combined$Title == "Mrs." & is.na(data.combined$Age2)), "Age2"] = mean_title2
data.combined[which(data.combined$Title == "Miss." & is.na(data.combined$Age2)), "Age2"] = mean_title3
data.combined[which(data.combined$Title == "Master." & is.na(data.combined$Age2)), "Age2"] = mean_title4
data.combined[which(data.combined$Title == "Other." & is.na(data.combined$Age2)), "Age2"] = mean_title5
summary(data.combined$Age)
summary(data.combined$Age2)
ggplot(data.combined[1:891,], aes(x=Age, fill=Survived)) + geom_density(alpha=0.5)
ggplot(data.combined[1:891,], aes(x=Age2, fill=Survived)) + geom_density(alpha=0.5)
#if I want to substitute with the overall mean or median in a few lines of code
library(Hmisc)
#Age3: NAs replaced by the overall mean (truncated to integer)
data.combined$Age3 = data.combined$Age
set.seed(1234)
data.combined$Age3 = as.integer(impute(data.combined$Age3, fun = mean))
ggplot(data.combined[1:891,], aes(x=Age3, fill=Survived)) + geom_density(alpha=0.5)
#Age4: NAs replaced by the overall median (truncated to integer)
data.combined$Age4 = data.combined$Age
data.combined$Age4 = as.integer(impute(data.combined$Age4, fun = median))
ggplot(data.combined[1:891,], aes(x=Age4, fill=Survived)) + geom_density(alpha=0.5)
#linear regression model: Age5 predicts the missing ages from other features
data.combined$Age5 = data.combined$Age
model = lm(Age5 ~ Pclass + New_Title + Number_People + Avg_Fare, data.combined)
I = is.na(data.combined$Age5)
data.combined$Age5[I] = as.integer(predict(model, newdata = data.combined[I, ]))
ggplot(data.combined[1:891,], aes(x=Age5, fill=Survived)) + geom_density(alpha=0.5)
#let's see how Age5 now can explain the survival rate
mutinformation(rf.label, discretize(data.combined$Age5[1:891])) #very little :(
#read the following link:
#https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable
#as a training, let's take a categorical variable (Embarked) and let's do imputation on that
data.combined[which(data.combined$Embarked == ""), "Embarked"] = NA
data.combined[which(is.na(data.combined$Embarked)),]
data.combined$Embarked2 = data.combined$Embarked
set.seed(1234)
data.combined$Embarked2 = impute(data.combined$Embarked2, "random")
data.combined[which(is.na(data.combined$Embarked2)),] #NA values are no longer here
library(missMDA)
data("ozone")
res.impute <- imputeFAMD(ozone, ncp=3)
New_Ozone_df = res.impute$completeObs
#let's apply this to data.combined
data.combined2 = data.combined
data.combined2[, c(3, 8, 10:31)] <- NULL
data.combined2[c(7:9, 25:30, 77:78), 3] <- NA
res.impute2 <- imputeFAMD(data.combined2, ncp=5)
data.combined.new = res.impute2$completeObs
data.combined.new$Survived = as.character(data.combined.new$Survived)
Survived1 = substr(data.combined.new$Survived,nchar(data.combined.new$Survived),nchar(data.combined.new$Survived)+1)[1:891]
Survived2 = substr(data.combined.new$Survived,nchar(data.combined.new$Survived)-3,nchar(data.combined.new$Survived))[892:1309]
Survivedt = c(Survived1, Survived2)
data.combined.new$Survived = as.factor(Survivedt)
data.combined.new$Pclass = as.character(data.combined.new$Pclass)
Classt = substr(data.combined.new$Pclass,nchar(data.combined.new$Pclass),nchar(data.combined.new$Pclass)+1)
data.combined.new$Pclass = as.factor(Classt)
####################################################################################
# OUTLIERS DETECTION AND FIXING
#let's take for example Age6 in order to include it inside the model (even if it doesn't seem to bring a lot of predictive capacity)
boxplot.stats(data.combined$Age5)$out
data.combined3 = data.combined[which(!(data.combined$Age5 %in% boxplot.stats(data.combined$Age5)$out) & data.combined$Survived != "None"),]
rf.label.data.combined3 = data.combined3$Survived
mutinformation(discretize(data.combined3$Age), rf.label.data.combined3) #it doen't make any sense to drop the outliers
#let's make a submission including the Age5 feature
features = c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")
rpart.train.4 = data.combined[1:891, features]
rpart.3.cv.2 = rpart.cv(94622, rpart.train.4, rf.label.4, control1)
rpart.3.cv.2
#BUG FIX: inspect the model just trained (rpart.3.cv.2), not the stale rpart.3.cv.1
prp(rpart.3.cv.2$finalModel, extra=1, under=TRUE)
rpart.3.cv.2$finalModel
test.submit.df5 = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")]
rpart.preds = predict(rpart.3.cv.2$finalModel, test.submit.df5, type = "class")
table(rpart.preds)
submit.df3 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190116_2.csv", row.names = FALSE)
####################################################################################
# SCALING OF EACH VARIABLE
#scaling all the variables included in the model (normalization), then check mutual information and submit
data.combined_BackUP = data.combined
preProc = preProcess(data.combined[,c("Number_People", "Avg_Fare", "Age5")], method = c("center", "scale"))
data.combined[,c("Number_People", "Avg_Fare", "Age5")] = predict(preProc, data.combined[,c("Number_People", "Avg_Fare", "Age5")])
features = c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")
rpart.train.4 = data.combined[1:891, features]
rpart.3.cv.2 = rpart.cv(94622, rpart.train.4, rf.label.4, control1)
rpart.3.cv.2
#BUG FIX: inspect the freshly trained rpart.3.cv.2, not the stale rpart.3.cv.1
prp(rpart.3.cv.2$finalModel, extra=1, under=TRUE)
rpart.3.cv.2$finalModel
test.submit.df5 = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")]
rpart.preds = predict(rpart.3.cv.2$finalModel, test.submit.df5, type = "class")
table(rpart.preds)
submit.df3 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190116_3.csv", row.names = FALSE)
#
#PRELIMINARY ANALYSIS
#
####################################################################################
#NOTE(review): from here the script repeats the preliminary analysis that opens the
#file; re-running it re-reads the raw CSVs and REBUILDS data.combined, discarding
#every engineered column created above (Title, Avg_Fare, ...) — confirm this is intended
#upload dataset
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)
#merge two datasets into one
#first thing, I add the column/variable Survived to the test set
test.survived = data.frame(Survived = rep("None", nrow(test)), test[,])
#then I have to swap the position of the first two columns
test.survived[ , c(1,2)] <- test.survived[ , c(2,1)]
colnames(test.survived)[c(1,2)] <- colnames(test.survived)[c(2,1)]
#finally I can combine since they perfectly match each other
data.combined = rbind(train, test.survived)
#remove column passengerID
data.combined[1] <- NULL
#some information on the dataframe
str(data.combined)
#I don't want Survived and Pclass to be int or chr: for example, Pclass 3 is not > Pclass 1
#they are factors, not ordered quantities (3=third class, 1=first class)
data.combined$Survived = as.factor(data.combined$Survived)
data.combined$Pclass = as.factor(data.combined$Pclass)
#take the "Survived" variable and tabulate data according to that
table(data.combined$Survived) #the dataset is not too skewed (no unbalanced db)
#distribution across classes and cross tabulation
table(data.combined$Pclass)
table(data.combined$Pclass, data.combined$Survived)
#count with condition
count_0 = length(which(data.combined$Survived == "0"))
count_1 = length(which(data.combined$Survived == "1"))
#or sum with condition
sum_0 = sum(data.combined$Survived == "0")
sum_1 = sum(data.combined$Survived == "1")
table(data.combined$Pclass, data.combined$Survived)
tableSC = table(data.combined$Pclass, data.combined$Survived)/(count_0+count_1)
#I want to see the %: I write a function
#Format a proportion (scalar, vector, or table) as a percentage string,
#e.g. percent(0.5) -> "50.00%". `digits` and `format` are forwarded to formatC().
percent <- function(x, digits = 2, format = "f") {
  scaled <- formatC(100 * x, format = format, digits = digits)
  paste0(scaled, "%")
}
percent(tableSC)
#let's make some plots to clarify visually the hypothesis: the rich survived more
#load a library to make good plots
library(ggplot2)
train$Pclass = as.factor(train$Pclass)
ggplot(train, aes(x=Pclass, fill=factor(Survived)))+
geom_bar(width=0.5) +
xlab("Class") +
ylab("Total Count") +
labs(fill="Survived") +
scale_fill_brewer(palette="Paired")
#name as character, not as factor
train$Name = as.character(train$Name)
str(train)
head(train$Name) #check the first names in the db
length(unique(train$Name)) #the result suggests that all names are unique
length(unique(data.combined$Name)) #how many unique names across datasets: we have 2 duplicates
#let's check if it's a matter of common names or duplicated records to be removed
#from data.combined I grab the variable "Name" and I find all duplicates that I store in this vector
dup.names = as.character(data.combined[which(duplicated(data.combined$Name)), "Name"])
#then I take a look at these records
data.combined[which(data.combined$Name %in% dup.names),]
#names contain titles like "Miss, Mrs, Mr etc."; I want to see if they can be used in the analysis
library(stringr)
#NOTE(review): "Miss." is used here as a regular expression, where "." matches ANY
#character; use stringr::fixed("Miss.") for a literal match
misses = data.combined[which(str_detect(data.combined$Name, "Miss.")),]
misses[1:6,]
mrs = data.combined[which(str_detect(data.combined$Name, "Mrs.")),]
mrs[1:6,]
#Now let's check the sex patterns
#BUG FIX: the original filtered data.combined with a condition computed on `train`
#(891 rows), silently pairing training-set sexes with combined-set rows; use
#data.combined's own Sex column so the mask matches the frame being subset
males = data.combined[which(data.combined$Sex == "male"), ]
males[1:6,]
table(train$Survived, train$Sex)
#survival counts and rates split by sex
Nmale = sum(train$Sex=="male")
Nfemale = sum(train$Sex=="female")
Nfemale00 = sum(train$Sex=="female" & train$Survived=="0") #OR
Nfemale0 = sum(train['Sex']=="female" & train['Survived']=="0")
Nfemale1 = sum(train$Sex=="female" & train$Survived=="1")
Nmale0 = sum(train$Sex=="male" & train$Survived=="0")
Nmale1 = sum(train$Sex=="male" & train$Survived=="1")
Nfemale1/Nfemale
Nfemale0/Nfemale
Nmale1/Nmale
Nmale0/Nmale
#of course there is a clear correlation between the probability to survive and the sex
#explore the difference between the str_detect and grep functions
data.combined$Name = as.character(data.combined$Name)
ciao1 = grep("Miss.", data.combined$Name)
ciao2 = str_detect(data.combined$Name, "Miss.")
ciao2[1:100] #boolean output (TRUE where the Name matches "Miss.", FALSE otherwise)
ciao1[1:100] #gives back the row positions of the Names which match "Miss."
#but there's another piece of useful information in the name that can be used in the model (title)
#we create the title variable to expand upon the relationship between Pclass and Survived
# Extract a coarse honorific from a raw passenger name string.
# Returns one of "Miss.", "Mrs.", "Master.", "Mr." or the catch-all "Other.".
#
# BUG FIX: the original grep() calls treated the patterns as regular expressions,
# where "." matches ANY character — e.g. "Mr." also matched "Mr " or "Mr" followed
# by any letter. fixed = TRUE makes each check a literal substring search.
# "Mrs." is still tested before "Mr." because "Mrs." contains "Mr" as a substring.
extractTitle = function(Name) {
  Name = as.character(Name)
  if (length(grep("Miss.", Name, fixed = TRUE)) > 0) {
    return ("Miss.")
  } else if (length(grep("Mrs.", Name, fixed = TRUE)) > 0) {
    return ("Mrs.")
  } else if (length(grep("Master.", Name, fixed = TRUE)) > 0) {
    return ("Master.")
  } else if (length(grep("Mr.", Name, fixed = TRUE)) > 0) {
    return ("Mr.")
  } else {
    return ("Other.")
  }
}
#build one title per passenger; vapply preallocates the result and type-checks
#each element, replacing the original grow-by-c() loop (which copies the whole
#vector on every iteration — O(n^2) in total)
titles = vapply(data.combined$Name, extractTitle, character(1), USE.NAMES = FALSE)
data.combined$Title = as.factor(titles) #new Title column on data.combined
####################################################################################
#CHECK why it doesn't work instead of the line ahead!!!
#if (str_detect(data.combined$Name, "Miss.")) {
# return ("Miss.")
#ANSWER(review): str_detect() returns a logical VECTOR (one element per name),
#while if() needs a single scalar condition — use any(str_detect(...)) or keep grep()
####################################################################################
#let's graph Title, Pclass and Survived/No
data.combined$Title = as.factor(data.combined$Title)
ggplot(data.combined[1:891,], aes(x=Title, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Title") +
ylab("Total Count") +
labs(fill = "Survived")
#Explore Sex variable
table(data.combined$Sex) #males are about double the females -> we will account for that
#let's graph Sex, Pclass and Survived/No
ggplot(data.combined[1:891,], aes(x=Sex, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") +
ggtitle("Pclass") +
xlab("Sex") +
ylab("Total Count") +
labs(fill = "Survived")
summary(data.combined$Age)
summary(data.combined[1:891, "Age"]) #bad news: many Age values are missing even in the training set
#it would be useful to find a proxy for age (look at Title)
#let's graph Age, Sex, Pclass and Survived/No
ggplot(data.combined[1:891,], aes(x=Age, fill = Survived)) +
geom_histogram(binwidth = 10) +
facet_wrap(~Sex+Pclass) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
library(Hmisc) #for the function cut2
#let's come back to "Title" feature engineering
#First validate the hypothesis: "Master is a proxy for young male"
boys = data.combined[which(data.combined$Title == "Master."), ]
mean_Age_Boys = mean(boys$Age, na.rm = TRUE) #the average age is indeed very low
summary(boys$Age)
#Miss. instead is more complicated (not easy to associate to a specific time-frame)
miss = data.combined[which(data.combined$Title == "Miss."), ]
summary(miss$Age)
ggplot(miss, aes(x=Age, fill=Survived)) +
geom_histogram(binwidth = 1) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
#same plot restricted to training rows (test rows have Survived == "None")
ggplot(miss[which(miss$Survived != "None"),], aes(x=Age, fill=Survived)) +
geom_histogram(binwidth = 1) +
ggtitle("Pclass") +
xlab("Age") +
ylab("Total Count") +
labs(fill = "Survived")
miss2 = miss[which(miss$Survived != "None"),]
summary(miss$Age)
summary(miss2$Age)
####################################################################################
#CHECK -> in the future do some feature engineering to define when a Miss. is a child or a woman
#the reason is that the survival probability of a young Miss. seems to differ from an older Miss.
#for example, check other variables (number of children, parents etc.)
#moreover, check how number of children, parents etc. influences the survival rate for all
####################################################################################
#let's explore this point for future feature engineering
#alone = no siblings/spouse AND no parents/children aboard
miss_alone = miss[which(miss$SibSp == "0" & miss$Parch == "0"),]
#BUG FIX: "not alone" is the complement of alone — at least one of SibSp/Parch is
#non-zero, so the conditions must be joined with OR, not AND (the original AND
#silently dropped Misses travelling with only siblings or only parents/children)
miss_not_alone = miss[which(miss$SibSp != "0" | miss$Parch != "0"),]
summary(miss_alone)
summary(miss_not_alone)
#noteworthy that Miss. with children died more
tableMA = table(miss_alone$Survived)
tableMNA = table(miss_not_alone$Survived)
#age distribution of accompanied Misses (training rows only)
ggplot(miss_not_alone[which(miss_not_alone$Survived != "None"), ], aes(x=Age, fill=Survived))+
geom_histogram(binwidth = 1) +
ggtitle("Miss. with parents or children")
summary(miss_not_alone$Age)
#age distribution of unaccompanied Misses (training rows only)
ggplot(miss_alone[which(miss_alone$Survived != "None"), ], aes(x=Age, fill=Survived))+
geom_histogram(binwidth = 1) +
ggtitle("Miss. alone")
summary(miss_alone$Age)
#I've seen that Misses not alone died more -> I want to check the class of these Misses
ggplot(miss_not_alone[which(miss_not_alone$Survived != "None"), ], aes(x=Pclass, fill=Survived))+
geom_bar(width = 1) +
ggtitle("Miss. with parents or children")
#BUG FIX: the original filtered miss_alone with a condition computed on
#miss_not_alone (a different data frame with a different row count); the mask
#must come from miss_alone's own Survived column
ggplot(miss_alone[which(miss_alone$Survived != "None"), ], aes(x=Pclass, fill=Survived))+
geom_bar(width = 1) +
ggtitle("Miss. alone")
#Among "Miss.", rich women with children did not die -> third-class Misses had to choose between themselves and their children
#find the children whose mother died, to see whether the children died too
#collect all children (cutoff: Age <= 14.5, the maximum "Master." age seen earlier)
children = data.combined[which(data.combined$Age <= 14.5),]
#how many children died, broken down by class (training rows only: Survived != "None")
ggplot(children[which(children$Survived != "None"),], aes(x=Pclass, fill=Survived)) + geom_bar(width = 0.8)
#many third-class children died; check whether the mothers of saved children died
#assumes people sharing a ticket number are travelling relatives -- TODO confirm
data.combined[which(data.combined$Ticket == "349909"),] #one of the children who died
#Misses sharing a ticket with a dead child ("miss" is built earlier in the file)
children_died = children[which(children$Survived == "0"),]
ticket_children_died = children_died$Ticket
miss_with_children_died = miss[which(miss$Ticket %in% ticket_children_died),]
ggplot(miss_with_children_died[which(miss_with_children_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
ggtitle("Misses with their children dead")
#Misses whose child died tended to die as well (they would have saved their child first)
children_not_died = children[which(children$Survived == "1"),]
ticket_children_not_died = children_not_died$Ticket
miss_with_children_not_died = miss[which(miss$Ticket %in% ticket_children_not_died),]
ggplot(miss_with_children_not_died[which(miss_with_children_not_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
ggtitle("Misses with their children not dead")
#apparently every Miss whose child survived also survived (mother and child could stay together)
#as seen previously, I expect most of the women with a dead child to be in third class
ggplot(miss_with_children_died[which(miss_with_children_died$Survived != "None"), ], aes(x=Pclass, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") +
ggtitle("Dristibution of Misses with their children dead across Pclass")
#confirmed: mostly third class
#therefore, as said before, a Miss travelling alone has a higher chance of surviving
#third-class Misses with children could not save their children's lives (nor their own)
#taking 14.5 as the maximum Master. age, check how many rows of miss_alone are that young
length(which(miss_alone$Age <= 14.5)) #only 4 out of 150 are 14.5 or younger
#let's move to the SibSp variable (number of siblings/spouses aboard)
summary(data.combined$SibSp)
#can we treat it as a factor (for more interesting visualization)?
length(unique(data.combined$SibSp)) #few unique values, so yes
data.combined$SibSp = as.factor(data.combined$SibSp)
####################################################################################
#CHECK -> if a lot of unique values, make intervals first then factorize
#extra: give a name to each factor interval
####################################################################################
#survival by SibSp, faceted on Pclass + Title (training rows only)
#fixed: the original chain was broken after ggtitle() (missing "+"), so the
#xlab/ylab/labs/ylim lines formed a separate, invalid expression; labs() also
#needs a named argument to set the fill legend title
ggplot(data.combined[1:891,], aes(x= SibSp, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass + Title) +
ggtitle("Pclass, Title") +
xlab("Number of Sib/Sp") +
ylab("Total Count") +
labs(fill = "Survived") +
ylim(0,300)
#"Master." is interesting but the y values are unreadable in the faceted plot -> isolate it
master_df = data.combined[which(data.combined$Title == "Master." & data.combined$Survived != "None"),]
ggplot(master_df, aes(x=SibSp, fill=Survived)) + geom_bar(width = 0.8)
#clearly, boys with more than 2 siblings/spouses mostly died -> large families died
ggplot(master_df, aes(x=SibSp, fill=Survived)) + geom_bar(width = 0.8) + facet_wrap("Pclass")
#and this holds regardless of class (noteworthy: only third class has many children)
ggplot(data.combined[1:891,], aes(x= SibSp, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Number of Sib/Sp") +
ylab("Total Count") +
labs("Survived")
#NOTE(review): labs("Survived") passes an unnamed argument; labs(fill = "Survived") was probably intended
#victims only, split by SibSp
only_dead = data.combined[which(data.combined$Survived == "0"),]
ggplot(only_dead, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~SibSp) +
xlab("Pclass") +
ylab("TotalCount")
#NOTE(review): "not alone" is filtered with AND, keeping only rows with BOTH SibSp != 0
#AND Parch != 0; an OR (any relative aboard) may have been intended -- confirm
only_dead_not_alone = data.combined[which(data.combined$Survived == "0" & data.combined$SibSp != "0" & data.combined$Parch != 0),]
ggplot(only_dead_not_alone, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~SibSp) +
xlab("Pclass") +
ylab("TotalCount")
#let's move to the Parch variable (parents/children aboard) and repeat the analysis
summary(data.combined$Parch)
#can we treat it as a factor (for more interesting visualization)?
length(unique(data.combined$Parch)) #few unique values, so yes
data.combined$Parch = as.factor(data.combined$Parch)
#survival by Parch, faceted on Pclass + Title (training rows only)
#fixed: missing "+" after ggtitle() broke the chain (the xlab/ylab/labs/ylim part
#was a separate, invalid expression); labs() also needs a named argument
ggplot(data.combined[1:891,], aes(x= Parch, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass + Title) +
ggtitle("Pclass, Title") +
xlab("Number of Parents/Children") +
ylab("Total Count") +
labs(fill = "Survived") +
ylim(0,300)
#survival by Parch, faceted on Pclass only (training rows)
ggplot(data.combined[1:891,], aes(x= Parch, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Number of Parch") +
ylab("Total Count") +
labs("Survived")
#NOTE(review): labs("Survived") passes an unnamed argument; labs(fill = "Survived") was probably intended
#victims only, split by Parch
only_dead = data.combined[which(data.combined$Survived == "0"),]
ggplot(only_dead, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~Parch) +
xlab("Pclass") +
ylab("TotalCount")
#NOTE(review): AND keeps only rows with both SibSp != 0 and Parch != 0 -- confirm intent ("not alone" suggests OR)
only_dead_not_alone = data.combined[which(data.combined$Survived == "0" & data.combined$SibSp != "0" & data.combined$Parch != 0),]
ggplot(only_dead_not_alone, aes(x=Pclass)) +
geom_bar(width = 1, fill = "red") +
facet_wrap(~Parch) +
xlab("Pclass") +
ylab("TotalCount")
#Feature Engineering: family size = SibSp + Parch + 1 (the passenger himself/herself)
temp.sibsp = c(train$SibSp, test$SibSp)
temp.Parch = c(train$Parch, test$Parch)
family.size = temp.Parch + temp.sibsp + 1
data.combined$Family_Size = as.factor(family.size)
ggplot(data.combined[1:891,], aes(x=Family_Size, fill=Survived)) +
geom_bar(width = 0.8)
#survival rates drop sharply for families of more than 4 members
#hence split family size in two groups: Large (>4) and Small (<=4)
#fixed: the original looped row by row, derived the size via as.numeric(factor)
#(which yields level codes, not values) and made no-op per-cell as.factor()
#assignments; a vectorized ifelse on the raw numeric size is equivalent and safe
data.combined$New_Family_Size = as.factor(
  ifelse(family.size <= 4, "Small Family", "Large Family")
)
ggplot(data.combined[1:891, ], aes(x=New_Family_Size, fill = Survived)) + geom_bar(width = 0.8)
table(data.combined[1:891, "New_Family_Size"], data.combined[1:891, "Survived"])
#Variable Cabin
ggplot(data.combined[1:891, ], aes(x=Cabin, fill=Survived)) + geom_bar(width = 0.8)
length(unique(data.combined$Cabin)) #far too many distinct values to plot directly
#reduce Cabin to its first character (the deck letter)
data.combined$New_Cabin = as.character(data.combined$Cabin)
data.combined$New_Cabin = str_sub(data.combined$New_Cabin,1,1)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Cabin != ""),], aes(x=New_Cabin, fill=Survived)) +
geom_bar(width = 0.8) #cabin seems not to have an influence (even though proximity was expected to matter)
#caveat: dropping the blank cabins removed exactly the passengers (plausibly third class) with no cabin record
#redo keeping the blanks, recoded as "O"
data.combined$New_Cabin2 = as.character(substr(data.combined$Cabin,1,1))
data.combined[which(data.combined$New_Cabin2 == ""), "New_Cabin2"] = "O"
ggplot(data.combined[1:891,], aes(x=New_Cabin2, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass")
#therefore, we can use Pclass as a good proxy and drop this variable (avoiding overfitting risk)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Cabin != ""),], aes(x=New_Cabin, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass") #cabin information exists almost only for first-class passengers
table(data.combined$Pclass, data.combined$New_Cabin) #indeed -- which explains the high survival rates in the previous plot
table(data.combined$Survived, data.combined$New_Cabin)
table(data.combined$Sex, data.combined$New_Cabin)
#from the tables: cabin A is the only one with a clearly bad death toll,
#but that is explained by its much higher share of males
#suggestion: remove this variable
#before the next variable, look at passengers holding multiple cabins
data.combined$Cabin = as.character(data.combined$Cabin)
#NOTE(review): missing cabins appear to be "" rather than NA, so this line may replace nothing -- confirm
data.combined[which(is.na(data.combined$Cabin)), "Cabin"] = "Other"
#a space inside the cabin string means several cabin codes were listed
multiple_cabin = ifelse(str_detect(data.combined$Cabin, " "), "Multiple", "Alone")
data.combined$multiple.cabin = as.factor(multiple_cabin)
prop.table(table(data.combined$multiple.cabin, data.combined$Survived))
ggplot(data.combined[1:891,], aes(x=multiple.cabin, fill=Survived)) + geom_bar(width = 0.8)
ggplot(data.combined[1:891,], aes(x=multiple.cabin, fill=Survived)) + geom_bar(width = 0.8) + facet_wrap("Pclass")
#Variable Embarked (port of embarkation)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$Embarked != ""),], aes(x=Embarked, fill=Survived)) +
geom_bar(width = 0.8) #S and Q show more deaths than C, but the port itself should not matter
#more plausibly, a given class mainly embarked at C (the survival correlation is really driven by class)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$Title != "Mr."),], aes(x=Embarked, fill=Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Pclass")
#suggestion: remove this variable
#Variable Fare
#interesting, but Pclass is a good proxy for the Fare; still, look at its distribution
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=Fare, fill=Survived)) + geom_histogram(binwidth = 10)
table(data.combined$Pclass, data.combined$Fare) #too granular -- fares must be aggregated
#average fare per class, and per class+sex; a small helper removes the 9-fold
#duplication of the mean(...) expression while keeping all nine variable names
#(mean_3.1 is reused below to impute the missing fare)
fare_mean = function(pclass, sex = NULL) {
  #rows of the given class, optionally restricted to one sex; NA fares ignored
  rows = data.combined$Pclass == pclass
  if (!is.null(sex)) {
    rows = rows & data.combined$Sex == sex
  }
  mean(data.combined[which(rows), "Fare"], na.rm = TRUE)
}
mean_1 = fare_mean("1")
mean_1.1 = fare_mean("1", "male")
mean_1.2 = fare_mean("1", "female")
mean_2 = fare_mean("2")
mean_2.1 = fare_mean("2", "male")
mean_2.2 = fare_mean("2", "female")
mean_3 = fare_mean("3")
mean_3.1 = fare_mean("3", "male")
mean_3.2 = fare_mean("3", "female")
#I want to divide fares in two groups: <=50 and >50 (something more general than Pclass)
#Check after: men in second and third class died at the same rate (women did not -> check whether surviving second-class women had different fares)
#Check after: how to fix N/A values for Age
#Check after: how to balance male/female data before model development -> study unbalanced datasets
#Check after: some fares are inconsistent with the class (they should be adjusted; but this variable has already been slated for removal)
#locate the rows with a missing Fare (vectorized which(); this replaces the original
#row-by-row loop, which only ever kept the last NA row it found)
to_be_changed = data.combined[which(is.na(data.combined$Fare)),]
#impute the missing fare with the third-class male average (the passenger is a third-class male)
data.combined[which(is.na(data.combined$Fare)), "Fare"] = mean_3.1
#now that the N/A is replaced with its group mean, create the bucketed variable
#fixed: vectorized ifelse replaces the original row loop (whose per-cell
#as.integer() assignments had no effect anyway); same Poor/Rich outcome
data.combined$New_Fare = as.factor(
  ifelse(as.integer(data.combined$Fare) <= 50, "Poor", "Rich")
)
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=New_Fare, fill = Survived)) +
geom_bar(width = 0.8)
ggplot(data.combined[which(data.combined$Survived != "None"),], aes(x=New_Fare, fill = Survived)) +
geom_bar(width = 0.8) +
facet_wrap("Sex")
#everything this plot shows is available in more detail through Pclass
#so this variable can be removed as well, keeping Pclass
#Variable Ticket -> the most cryptic one
length(unique(data.combined$Ticket))
head(data.combined$Ticket)
data.combined[1:10,]
#roughly looking at data: three things to be verified:
# 1) rows with the same Ticket have the same fare
# 2) initially, the first number is the Pclass (but then changes) -> it might be a matter of precedence (first-class people boarded first) -> both in number-only and letter+number tickets
# 3) some codes are numbers, others also have letters which seem to be associated to places (e.g. Paris) -> feature engineering with the history of the Titanic
#strategy: reverse each ticket string so the trailing number comes first, split off
#that number, and keep the alphabetic prefix in New_Ticket
data.combined$New_Ticket = as.character(data.combined$Ticket)
data.combined$New_Ticket2 = as.character(data.combined$Ticket)
library(tcR)
data.combined$New_Ticket = reverse.string(data.combined$New_Ticket)
data.combined$New_Ticket2 = reverse.string(data.combined$New_Ticket2)
#NOTE(review): columns 17 and 18 are addressed by position -- fragile; this only works
#if the current column layout matches what the author had -- confirm before rerunning
for (i in c(1:nrow(data.combined))){
data.combined[i, c(17,18)] = str_split_fixed(data.combined[i, "New_Ticket"], " ", 2)
}
data.combined$New_Ticket = NULL
colnames(data.combined)[17] = "New_Ticket"
data.combined$New_Ticket = as.character(data.combined$New_Ticket)
data.combined$New_Ticket = reverse.string(data.combined$New_Ticket) #maybe it doesn't work with empty strings
#NOTE(review): the next line reverses the non-empty entries a SECOND time right after the
#line above reversed everything, so non-empty strings end up reversed again; one of the
#two lines was probably meant to replace the other -- confirm which is intended
data.combined[which(data.combined$New_Ticket != ""), "New_Ticket"] = reverse.string(data.combined[which(data.combined$New_Ticket != ""), "New_Ticket"])
length(unique(data.combined$New_Ticket))
ggplot(data.combined[which(data.combined$New_Ticket != "" & data.combined$Survived != "None"),], aes(x=New_Ticket, fill=Survived)) +
geom_bar(width = 0.4)
#too many unique values -> impossible to draw any conclusion
#extract a known marker from a raw ticket string (e.g. "PARIS" from "SC/PARIS 2133")
#returns the first marker found, checked in the original priority order, or "Other."
#fixed: the original grep() calls treated "." as a regex wildcard, so patterns like
#"C.A." or "W./C." could match unrelated tickets; matching is now literal
#(grepl with fixed = TRUE), which is what the dotted patterns intended
extractTicket = function(New_Ticket2) {
  New_Ticket2 = as.character(New_Ticket2)
  #literal substring test against the (single) ticket string
  has = function(marker) grepl(marker, New_Ticket2, fixed = TRUE)
  if (has("PARIS")) {
    return ("PARIS")
  } else if (has("C.A.")) {
    return ("C.A.")
  } else if (has("PC")) {
    return ("PC")
  } else if (has("Paris")) {
    return ("PARIS")
  } else if (has("SOTON")) {
    return ("SOTON")
  } else if (has("STON")) {
    return ("STON")
  } else if (has("W./C.")) {
    return ("W./C.")
  } else if (has("A/S")) {
    return ("A/S")
  } else {
    return ("Other.")
  }
}
#build New_Ticket2 by applying extractTicket to every raw ticket
#fixed: vapply replaces the original grow-by-c() loop, which reallocated the
#vector on every iteration (O(n^2)); the result is identical
data.combined$New_Ticket2 = as.factor(
  vapply(as.character(data.combined$Ticket), extractTicket, character(1), USE.NAMES = FALSE)
)
ggplot(data.combined[which(data.combined$Survived != "None" & data.combined$New_Ticket2 != "Other."),], aes(x=New_Ticket2, fill=Survived)) +
geom_bar(width = 0.8)
#SOTON and W./C. show very low survival rates, but: 1) their meaning is unknown, 2) the counts are too small to be significant
####################################################################################
# SUMMING UP:
# 1) Age has too many missing values - we might think to replace it
# 2) Sex is unbalanced -> we should have initially balance the dataset
# 3) We will remove Name and use Title as a proxy for Age and Sex (which can then be removed) -> this does not fix the imbalance
# 4) Cabin, Fare, ticket and Embarked will be removed
# 5) Parch and SibSp could be kept to further explore in relation to family size
# 6) All the other variables created should be removed (they were just editing training)
####################################################################################
####################################################################################
#
#EXPLANATORY MODEL
#
####################################################################################
library(randomForest)
#instead of rushing in with all variables, select a couple (Pclass and Title) that should have a great impact
#build a dedicated training dataframe for this first attempt
rf.train.1 = data.combined[1:891, c("Title", "Pclass")] #set of x (predictor variables)
#fixed: the label vector was created as "rf.labe.l" (typo) but used as "rf.label"
#below, which would fail with "object 'rf.label' not found"
rf.label = as.factor(train$Survived) #set of y (Survived or not)
#random forests are stochastic -> set.seed makes the run reproducible
#(the seed value matches the tutorial being followed)
set.seed(1234)
#importance = TRUE -> track the relative importance of features while building the trees
#ntree = 1000 -> number of trees in the forest (default is 500)
rf.1 = randomForest(x=rf.train.1, y=rf.label, importance = TRUE, ntree = 1000)
rf.1
varImpPlot(rf.1)
#20.76 OOB error estimate
#rf.2: Title + Pclass + Parch
rf.train.2 = data.combined[1:891, c("Title", "Pclass", "Parch")]
rf.label.2 = as.factor(train$Survived)
set.seed(1234)
rf.2 = randomForest(rf.train.2, rf.label.2, importance = TRUE, ntree = 1000)
rf.2
varImpPlot(rf.2)
#19.42 OOB error
#rf.3: Title + Pclass + SibSp
rf.train.3 = data.combined[1:891, c("Title", "Pclass", "SibSp")]
rf.label.3 = as.factor(train$Survived)
set.seed(1234)
rf.3 = randomForest(rf.train.3, rf.label.3, importance = TRUE, ntree = 1000)
rf.3
varImpPlot(rf.3)
#19.3 OOB error
#rf.4: Title + Pclass + Family_Size (re-factorizing is a no-op if it is already a factor)
data.combined$Family_Size = as.factor(data.combined$Family_Size)
rf.train.4 = data.combined[1:891, c("Title", "Pclass", "Family_Size")]
rf.label.4 = as.factor(train$Survived)
set.seed(1234)
rf.4 = randomForest(rf.train.4, rf.label.4, importance = TRUE, ntree = 1000)
rf.4
varImpPlot(rf.4)
#17.73 OOB error
#rf.5: Title + Pclass + New_Family_Size (the Small/Large bucket)
rf.train.5 = data.combined[1:891, c("Title", "Pclass", "New_Family_Size")]
rf.label.5 = as.factor(train$Survived)
set.seed(1234)
rf.5 = randomForest(rf.train.5, rf.label.5, importance = TRUE, ntree = 1000)
rf.5
varImpPlot(rf.5)
#17.51 OOB error
#rf.6: Title + Fare + New_Family_Size (Fare instead of Pclass)
rf.train.6 = data.combined[1:891, c("Title", "Fare", "New_Family_Size")]
rf.label.6 = as.factor(train$Survived)
set.seed(1234)
rf.6 = randomForest(rf.train.6, rf.label.6, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)
#17.62 OOB error
#rf.6 retrained after truncating Fare to integers
data.combined$Fare = as.integer(data.combined$Fare)
rf.train.6 = data.combined[1:891, c("Title", "Fare", "New_Family_Size")]
rf.label.6 = as.factor(train$Survived)
set.seed(1234)
rf.6 = randomForest(rf.train.6, rf.label.6, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)
#17.51 OOB error
#rf.7: Title + Fare + Family_Size
rf.train.7 = data.combined[1:891, c("Title", "Fare", "Family_Size")]
rf.label.7 = as.factor(train$Survived)
set.seed(1234)
rf.7 = randomForest(rf.train.7, rf.label.7, importance = TRUE, ntree = 1000)
rf.7
varImpPlot(rf.7)
#17.4 -> Fare beats Pclass (as expected, but with an overfitting risk)
#rf.8: Title + New_Fare (Poor/Rich) + New_Family_Size
rf.train.8 = data.combined[1:891, c("Title", "New_Fare", "New_Family_Size")]
rf.label.8 = as.factor(train$Survived)
set.seed(1234)
rf.8 = randomForest(rf.train.8, rf.label.8, importance = TRUE, ntree = 1000)
rf.8
varImpPlot(rf.8)
#17.73 OOB error
#as an experiment, create two ratio columns: SibSp/Family_Size and Parch/Family_Size
#fixed: these columns are factors at this point, and as.numeric(factor) yields the
#level CODES, not the underlying values (e.g. SibSp "0" becomes 1); converting
#through as.character first recovers the true numbers
#(note: the OOB figures below were recorded with the old, code-based ratios)
data.combined$Family_Size = as.numeric(as.character(data.combined$Family_Size))
data.combined$SibSp = as.numeric(as.character(data.combined$SibSp))
data.combined$Parch = as.numeric(as.character(data.combined$Parch))
data.combined$New_FS1 = data.combined$SibSp / data.combined$Family_Size
data.combined$New_FS2 = data.combined$Parch / data.combined$Family_Size
#rf.9: Title + Fare + SibSp ratio
rf.train.9 = data.combined[1:891, c("Title", "Fare", "New_FS1")]
rf.label.9 = as.factor(train$Survived)
set.seed(1234)
rf.9 = randomForest(rf.train.9, rf.label.9, importance = TRUE, ntree = 1000)
rf.9
varImpPlot(rf.9)
#18.52
#rf.10: Title + Fare + Parch ratio
rf.train.10 = data.combined[1:891, c("Title", "Fare", "New_FS2")]
rf.label.10 = as.factor(train$Survived)
set.seed(1234)
rf.10 = randomForest(rf.train.10, rf.label.10, importance = TRUE, ntree = 1000)
rf.10
varImpPlot(rf.10)
#18.29
#rf.11: both ratios together
rf.train.11 = data.combined[1:891, c("Title", "Fare", "New_FS1", "New_FS2")]
rf.label.11 = as.factor(train$Survived)
set.seed(1234)
rf.11 = randomForest(rf.train.11, rf.label.11, importance = TRUE, ntree = 1000)
rf.11
varImpPlot(rf.11)
#18.41
####################################################################################
# SUBMISSION
####################################################################################
#prediction set: the rows whose Survived is still "None" (the original test set)
test.submit.df = data.combined[which(data.combined$Survived == "None"), c("Title", "Fare", "Family_Size")]
#make predictions with rf.7 (Title + Fare + Family_Size)
rf.7.preds = predict(rf.7, test.submit.df)
table(rf.7.preds)
#final submission dataframe (PassengerID + predictions); test ids run 892..nrow
#NOTE(review): rep() with a single argument is a no-op; 892:nrow(data.combined) alone would suffice
submit.df = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rf.7.preds)
#write the .csv file to upload
#NOTE(review): the prediction factor levels should already be "0"/"1", so this relabelling is likely a no-op -- confirm
levels(submit.df$Survived) = c("0", "1")
write.csv(submit.df, "RF_20190114_1.csv", row.names = FALSE)
#Kaggle score: 0.78947
#a couple more submissions, this time with rf.5 (Title + Pclass + New_Family_Size)
test.submit.df2 = data.combined[which(data.combined$Survived == "None"), c("Title", "Pclass", "New_Family_Size")]
rf.5.preds = predict(rf.5, test.submit.df2)
table(rf.5.preds)
submit.df2 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rf.5.preds)
levels(submit.df2$Survived) = c("0", "1")
write.csv(submit.df2, "RF_20190114_2.csv", row.names = FALSE)
#Kaggle score: 0.78947
#actual scores are slightly below the OOB estimates -> instead of submitting repeatedly
#(adjusting and risking to overfit the leaderboard), use CROSS VALIDATION to obtain,
#before the submission, the right estimate of accuracy
####################################################################################
# CROSS VALIDATION
#it is used for:
#(1) after choosing the model type (random forest), checking whether a feature or a piece of feature engineering really improved the model
#(2) if the model type changes, does it work better or worse than the random forest?
####################################################################################
library(caret) #can do thousands of things, cross validation included
library(doSNOW) #enables parallel computing while doing cross validation
#10-fold cross-validation with stratified sampling (the survivor ratio is preserved in each fold), repeated 10 times
set.seed(2348)
cv.10.folds = createMultiFolds(rf.label.4, k=10, times=10)
#the folds are created; tell caret to use exactly these folds
control1 = trainControl(method = "repeatedcv", number = 10, repeats = 10, index = cv.10.folds)
#set up doSNOW for multi-core training -> parallel worker processes on the CPU
c1 = makeCluster(6, type = "SOCK") #6 = number of parallel worker processes
registerDoSNOW(c1) #registering tells caret to train in parallel
#training function for caret
library(e1071)
set.seed(34324)
rf.5.cv.1 = train(x = rf.train.4, y = rf.label.4, method = "rf", tuneLength = 2, ntree=1000, trControl = control1)
#note: method = "rf" -> "use the same random forest as before"; trControl -> "how training is done: repeated 10-fold cv ..."
#tuneLength = 2 -> try up to 2 candidate values of the tuned hyperparameter and keep the best
#shut down the cluster registered above
stopCluster(c1)
#check the results
rf.5.cv.1
#these results are closer to the OOB estimate than to the (less optimistic) submission score
#use fewer folds: with less data per training split the estimate should be less optimistic
set.seed(5983)
cv.3.folds = createMultiFolds(y = rf.label.4, k=3, times = 10)
control2 = trainControl(method = "repeatedcv", number = 3, repeats = 10, index = cv.3.folds)
c2 = makeCluster(6, type = "SOCK")
registerDoSNOW(c2)
set.seed(89472)
rf.5.cv.2 = train(x=rf.train.4, y=rf.label.4, method = "rf", tuneLength = 2, ntree = 1000, trControl = control2)
stopCluster(c2)
rf.5.cv.2
####################################################################################
# EXPLANATORY PART 2 and further FEATURE ENGINEERING
####################################################################################
#We take just one tree because it is easy to understand
#we create a function so that each attempt only needs to change some parameter values
#the objective is to select a single tree of the 1000 to see how it works and make improvements
#train a cross-validated single rpart tree (tuneLength = 30 candidate cp values)
#seed: RNG seed for reproducibility; training/labels: x/y; control: caret trainControl
#cores: number of parallel SOCK workers (default 6, matching the original hard-coded value)
rpart.cv = function(seed, training, labels, control, cores = 6){
  library(rpart)
  library(rpart.plot)
  library(doSNOW)
  library(caret)
  c1 = makeCluster(cores, type = "SOCK")
  #guarantee the cluster is shut down even if train() errors (the original leaked it on failure)
  on.exit(stopCluster(c1), add = TRUE)
  registerDoSNOW(c1)
  set.seed(seed)
  rpart.cv = train(x=training, y=labels, method = "rpart", tuneLength = 30, trControl = control)
  return(rpart.cv)
}
features = c("Title", "Pclass", "Family_Size")
rpart.train.1 = data.combined[1:891, features]
#let's run the function created
rpart.1.cv.1 = rpart.cv(94622, rpart.train.1, rf.label.4, control1)
rpart.1.cv.1
#Plot
prp(rpart.1.cv.1$finalModel, extra = 1, under = TRUE)
#Note: suggested to put "rpart.1.cv.1$finalModel" in the Console
# Interesting things noted that could be subjected to improvments:
# 1) If you are a Mr. or Other you are classified automatically as Not Survived, creationg obvious inacuracy (for example we know that male in first class survived more than male in third)
# 2) All other titles go to the next text: You are in first and second class or third class (this time is accurate -> no need for improvments)
# 3) Last test: if your family size is more than 5, you die, otherwise you survive with 100% accuracy -> very specific, overfitting
# - the test size can have a family of more than 5 in third class in which they survive for any reasons
#deeper look at title (it's strange to have Mr. and Other together -> maybe other groups different types of folk that can be better classified)
table(data.combined$Title)
data.title = data.combined[which(data.combined$Title == "Other."), c("Survived", "Pclass", "Sex", "Name")]
#indeed "Other." hides several distinct titles that should not be overlooked
#split the names and extract every title
data.combined[1:10, "Name"] #format: LastName, Title FirstName otherNames
library(stringr)
name.splits = str_split(data.combined$Name, ",")
last.names = sapply(name.splits, "[", 1)
#alternatively: last.names2 = sapply(strsplit(data.combined$Name, ","), head, n = 1) (or tail for the last value)
secondpart.names = sapply(name.splits, "[", 2)
name.splits = str_split(secondpart.names, " ")
titles = sapply(name.splits, "[", 2)
unique(titles)
#many titles need better grouping (e.g. Lady and Dona are women but, being classified as "Other.", they were always predicted to die)
#the "the" title looks strange
#data.combined[which(titles == "the"),]
#initial approach (create the new column first, then inspect it):
#data.combined$New_Title = titles
#data.combined[which(data.combined$New_Title == "the"),]
#interestingly, the dataset can also be filtered through the external "titles" vector before it formally becomes a column
#"the" came from "the Countess." -> it can be mapped to a female title
#data.combined[which(data.combined$New_Title == "the"), "New_Title"] <- "Mrs."
#instead of per-column fixes, clean the "titles" vector first and only then create New_Title
#data.combined$New_Title = NULL
data.combined[which(titles == "the"),]
titles[titles == "the"] = "Lady."
#titles[titles == "Dr." | titles == "Dona."]
#or alternatively
#titles[titles %in% c("Dona.", "Dr.")]
#(Dona. and Dr. should not be merged -- the lines above were just for learning)
data.combined[which(titles == "Dona."),] #-> 39 years old, travelling with SibSp and Parch -> treat as a married woman
titles[titles == "Dona."] = "Lady."
data.combined[which(titles %in% c("Ms.", "Mlle.")),] #Mlle stands for Miss. (a misclassified item)
#Ms. is ambiguous here, but there is little difference between Miss and Mrs -> assign Miss.
titles[titles %in% c("Ms.", "Mlle.")] <- "Miss."
titles[titles %in% c("Jonkheer.", "Don.")] <- "Sir."
titles[titles %in% c("Col.", "Capt.", "Major.")] <- "Officer"
titles[titles == "Mme."] <- "Mrs."
unique(titles)
table(titles)
data.combined$New_Title = titles
library(ggplot2)
ggplot(data.combined[1:891,], aes(x=New_Title, fill=Survived)) +
geom_bar(width = 0.8) +
xlab("New_Title") +
ylab("Total Count") +
ggtitle("Survival rate for new titles") +
facet_wrap("Pclass")
#collapse the rare male titles into "Mr." and "Lady." into "Mrs."
data.combined[which(data.combined$New_Title %in% c("Dr.", "Officer", "Sir.", "Rev.")), "New_Title"] = "Mr."
data.combined[which(data.combined$New_Title == "Lady."), "New_Title"] = "Mrs."
table(data.combined$New_Title)
#quickly cross validate the tree with the cleaned-up titles
features = c("New_Title", "Pclass", "Family_Size")
rpart.train.2 = data.combined[1:891, features]
rpart.2.cv.1 = rpart.cv(94622, rpart.train.2, rf.label.4, control1)
rpart.2.cv.1
prp(rpart.2.cv.1$finalModel, extra=1, under=TRUE)
rpart.2.cv.1$finalModel
#better accuracy, but still: (1) "male -> dead" outright and (2) the family-size split must be fixed
#Dive into 1st class "Mr."
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
summary(first.mr.df)
#one female carries the title "Mr." -> update the title
data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1" & data.combined$Sex == "female"), "New_Title"] <- "Mrs."
#check whether there are other wrong assignments
table(data.combined$Sex, data.combined$New_Title)
#look at the misclassified (i.e. surviving) first-class males
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
summary(first.mr.df[which(first.mr.df$Survived == "1"),])
View(first.mr.df[which(first.mr.df$Survived == "1"),])
#something odd in ticket and fare: people sharing a ticket share one AGGREGATED fare
#(e.g. there is no individual 512.3292 fare -- it is the total paid by the whole ticket group, typically one family)
#this also shows Family_Size can mislead (ticket-sharers may have been counted as travelling alone)
#task: add two columns: how many people share each row's ticket, and the per-person average fare
#per-ticket group size and per-person average fare
#fixed: the original looped over every row and re-counted matching tickets each
#time (O(n^2)); a single table() lookup gives identical counts in O(n)
ticket.counts = table(data.combined$Ticket)
data.combined$Number_People = as.numeric(ticket.counts[as.character(data.combined$Ticket)])
data.combined$Avg_Fare = as.integer(data.combined$Fare / data.combined$Number_People)
#compare the survival distribution for 1st-class Mr. using Fare vs the new Avg_Fare
#fixed: first.mr.df was last subset BEFORE Number_People/Avg_Fare were created, so
#the plots below would fail with "object 'Avg_Fare' not found" -> rebuild it first
first.mr.df = data.combined[which(data.combined$New_Title == "Mr." & data.combined$Pclass == "1"),]
ggplot(first.mr.df, aes(x=Fare, fill=Survived)) +
geom_density(alpha = 0.5) +
ggtitle("1st Mr distribution by Fare")
ggplot(first.mr.df, aes(x=Avg_Fare, fill=Survived)) +
geom_density(alpha = 0.5) +
ggtitle("1st Mr distribution by Fare")
#the per-person fare also fixes the problem that many test-set fares had no counterpart in the training set
ggplot(first.mr.df[which(first.mr.df$Survived != "None"),], aes(x=Avg_Fare, fill=Survived)) + geom_density(alpha = 0.5)
ggplot(first.mr.df[which(first.mr.df$Survived != "None"),], aes(x=Number_People, fill=Survived)) + geom_density(alpha = 0.5)
#for 1st-class Mr., groups larger than ~4 are dense but their survival rate is close to 0
#both features show interesting patterns that could help the forecast, but they might be correlated
summary(data.combined$Avg_Fare)
summary(data.combined$Number_People)
#not strictly required here, but normalizing is good practice (caret preProcess: center + scale)
data.combined.to.normalise = data.combined[, c("Number_People", "Avg_Fare")]
preProc = preProcess(data.combined.to.normalise, method = c("center", "scale"))
data.combined.normalised = predict(preProc, data.combined.to.normalise)
#check how correlated they are
cor(data.combined.normalised$Number_People, data.combined.normalised$Avg_Fare) #very low
cor(data.combined$Number_People, data.combined$Avg_Fare) #same value before normalising (centering/scaling does not change correlation)
#NOTE(review): Family_Size was already converted to numeric earlier, so this is likely a no-op -- confirm
data.combined$Family_Size = as.numeric(data.combined$Family_Size)
#cross validate the tree with the two new ticket-based features
features = c("Pclass", "New_Title", "Number_People", "Avg_Fare")
rpart.train.3 = data.combined[1:891, features]
rpart.3.cv.1 = rpart.cv(94622, rpart.train.3, rf.label.4, control1)
rpart.3.cv.1
prp(rpart.3.cv.1$finalModel, extra=1, under=TRUE)
rpart.3.cv.1$finalModel
#New Submission
test.submit.df3 = data.combined[which(data.combined$Survived == "None"), c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rpart.preds = predict(rpart.3.cv.1$finalModel, test.submit.df3, type = "class")
table(rpart.preds)
submit.df3 = data.frame(PassengerID = rep(892:nrow(data.combined)), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190115_2.csv", row.names = FALSE)
#the result of accuracy is more than 0.80 and top 10% in the leaderboard
#To get the probability of surviving instead of the predicted class
test.submit.df4 <- data.combined[which(data.combined$Survived == "None"),
                                 c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
#BUG FIX: predictions were being computed on test.submit.df3;
#use the data frame built just above (df4)
rpart.preds4 <- predict(rpart.3.cv.1$finalModel, test.submit.df4, type = "prob")
table(rpart.preds4)
#FIX: single-argument rep() is a no-op — use the plain id sequence
submit.df4 <- data.frame(PassengerID = 892:nrow(data.combined), Survived = rpart.preds4)
write.csv(submit.df4, "Probability_of_Surviving.csv", row.names = FALSE)
#Corresponding random forest on the same engineered features
library(randomForest)
features <- c("Pclass", "New_Title", "Number_People", "Avg_Fare")
rf.train.temp <- data.combined[1:891, features]
rf.label <- as.factor(data.combined[which(data.combined$Survived != "None"), "Survived"])
set.seed(1234)
#BUG FIX: the label vector created above is rf.label, not rf.label.10
#(rf.label.10 is never defined and would make this call error out)
rf.temp <- randomForest(x = rf.train.temp, y = rf.label, ntree = 1000)
rf.temp
varImpPlot(rf.temp)
#randomForest requires categorical predictors to be factors
data.combined$New_Title <- as.factor(data.combined$New_Title)
rf.train.t <- data.combined[1:891, c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rf.label.t <- as.factor(train$Survived)
set.seed(1234)
rf.t <- randomForest(rf.train.t, rf.label.t, importance = TRUE, ntree = 1000)
rf.t
varImpPlot(rf.t)
#Predict on the unlabeled (test) rows and build the submission
test.submit.df.t <- data.combined[which(data.combined$Survived == "None"),
                                  c("Pclass", "New_Title", "Number_People", "Avg_Fare")]
rf.preds.t <- predict(rf.t, test.submit.df.t)
table(rf.preds.t)
#FIX: single-argument rep() is a no-op — use the plain id sequence
submit.df.t <- data.frame(PassengerID = 892:nrow(data.combined), Survived = rf.preds.t)
#BUG FIX: previously wrote submit.df2 (a stale object from an earlier section);
#write THIS model's predictions
write.csv(submit.df.t, "RF_20190115_4.csv", row.names = FALSE)
####################################################################################
# MUTUAL INFORMATION - BASIC
library(infotheo)
#Mutual information: how much information about the survival label each single
#variable carries on its own.
#Useful to check whether the feature-engineering effort pays off (the derived
#features should carry at least as much information as the raw ones) and to
#validate that the selected features are likely to be predictive.
#NOTE(review): infotheo's mutinformation works on discrete data; the continuous
#fare variables are only discretised explicitly at the bottom — confirm the
#earlier calls on Fare/Avg_Fare behave as intended.
mutinformation(rf.label, data.combined$Pclass[1:891])
mutinformation(rf.label, data.combined$Parch[1:891])
mutinformation(rf.label, data.combined$SibSp[1:891])
mutinformation(rf.label, data.combined$Ticket[1:891])
mutinformation(rf.label, data.combined$Title[1:891])
mutinformation(rf.label, data.combined$New_Title[1:891])
mutinformation(rf.label, data.combined$New_Title2[1:891])
mutinformation(rf.label, data.combined$Fare[1:891])
mutinformation(rf.label, data.combined$Avg_Fare[1:891])
mutinformation(rf.label, data.combined$Family_Size[1:891])
mutinformation(rf.label, data.combined$New_Family_Size[1:891])
mutinformation(rf.label, data.combined$Sex[1:891])
mutinformation(rf.label, data.combined$Number_People[1:891])
#discretize() bins a continuous variable before computing mutual information
data.combined$FFF = discretize(data.combined$Fare)
mutinformation(rf.label, discretize(data.combined$Fare[1:891]))
mutinformation(rf.label, discretize(data.combined$Avg_Fare[1:891]))
####################################################################################
# DIMENSION REDUCTION - BASIC
library(Rtsne)
#Try a 2-D t-SNE visualisation of the 4-dimensional feature space.
#Start from the "easier" part of the data: every record whose title is not "Mr."
#BUG FIX: the original indexing had no comma, so the which() vector selected
#COLUMNS instead of rows; the trailing ", " makes it a row subset.
most.correct <- data.combined[which(data.combined$Title != "Mr."), ]
#to be finished
####################################################################################
# CONDITIONAL MUTUAL INFORMATION - BASIC
#to be done
####################################################################################
# MISSING VALUES - IMPUTATION OF NA
#Age is the variable with the highest number of NA values.
#Note: with this many NAs it is often better to use a proxy for age (such as Title).
sum(is.na(data.combined$Age))   # missing ages
sum(!is.na(data.combined$Age))  # observed ages
#Hand-made imputation: mean age of each Title group (NAs excluded from the mean)
mean_title1 <- mean(data.combined[which(data.combined$Title == "Mr."), "Age"], na.rm = TRUE)
mean_title2 <- mean(data.combined[which(data.combined$Title == "Mrs."), "Age"], na.rm = TRUE)
mean_title3 <- mean(data.combined[which(data.combined$Title == "Miss."), "Age"], na.rm = TRUE)
mean_title4 <- mean(data.combined[which(data.combined$Title == "Master."), "Age"], na.rm = TRUE)
mean_title5 <- mean(data.combined[which(data.combined$Title == "Other."), "Age"], na.rm = TRUE)
#Fill the missing ages in Age2 with the per-title means computed above
data.combined$Age2 <- data.combined$Age
title.means <- c("Mr." = mean_title1, "Mrs." = mean_title2, "Miss." = mean_title3,
                 "Master." = mean_title4, "Other." = mean_title5)
for (ttl in names(title.means)) {
  rows <- which(data.combined$Title == ttl & is.na(data.combined$Age2))
  data.combined$Age2[rows] <- title.means[[ttl]]
}
summary(data.combined$Age)
summary(data.combined$Age2)
#Compare the age density by survival before and after imputation
ggplot(data.combined[1:891, ], aes(x = Age, fill = Survived)) + geom_density(alpha = 0.5)
ggplot(data.combined[1:891, ], aes(x = Age2, fill = Survived)) + geom_density(alpha = 0.5)
#To substitute the overall mean or median in just a few lines, use Hmisc::impute
library(Hmisc)
#Age3: overall-mean imputation (truncated to integer)
data.combined$Age3 <- data.combined$Age
set.seed(1234)
data.combined$Age3 <- as.integer(impute(data.combined$Age3, fun = mean))
ggplot(data.combined[1:891, ], aes(x = Age3, fill = Survived)) + geom_density(alpha = 0.5)
#Age4: overall-median imputation (truncated to integer)
data.combined$Age4 <- data.combined$Age
data.combined$Age4 <- as.integer(impute(data.combined$Age4, fun = median))
ggplot(data.combined[1:891, ], aes(x = Age4, fill = Survived)) + geom_density(alpha = 0.5)
#Age5: imputation with a linear regression model on the other predictors
data.combined$Age5 <- data.combined$Age
model <- lm(Age5 ~ Pclass + New_Title + Number_People + Avg_Fare, data.combined)
#rows still missing an age (renamed from `I`, which masks base::I)
na.rows <- is.na(data.combined$Age5)
data.combined$Age5[na.rows] <- as.integer(predict(model, newdata = data.combined[na.rows, ]))
ggplot(data.combined[1:891, ], aes(x = Age5, fill = Survived)) + geom_density(alpha = 0.5)
#How much does the imputed Age5 explain survival?
mutinformation(rf.label, discretize(data.combined$Age5[1:891])) #very little :(
#On measuring association between the imputed (continuous) age and the
#categorical New_Title, see:
#https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable
#As a training exercise, impute a categorical variable (Embarked)
#Empty strings are really missing values — mark them as NA first
data.combined[which(data.combined$Embarked == ""), "Embarked"] <- NA
data.combined[which(is.na(data.combined$Embarked)), ]  # inspect the rows missing a port
data.combined$Embarked2 <- data.combined$Embarked
set.seed(1234)
#Hmisc::impute with "random" draws replacements at random from the observed values
data.combined$Embarked2 <- impute(data.combined$Embarked2, "random")
data.combined[which(is.na(data.combined$Embarked2)), ]  # the NA values are gone
#Model-based imputation with missMDA: FAMD (factor analysis of mixed data)
#handles data frames mixing continuous and categorical columns.
library(missMDA)
#demo first on the package's bundled ozone dataset
data("ozone")
res.impute <- imputeFAMD(ozone, ncp=3)
New_Ozone_df = res.impute$completeObs
#let's apply this to data.combined
data.combined2 = data.combined
#keep only a subset of columns before imputing
#NOTE(review): positions 3, 8, 10:31 are magic numbers — confirm they still match
#the intended columns if data.combined gains or loses columns upstream.
data.combined2[, c(3, 8, 10:31)] <- NULL
#artificially blank a few cells in column 3 so there is something to impute
data.combined2[c(7:9, 25:30, 77:78), 3] <- NA
res.impute2 <- imputeFAMD(data.combined2, ncp=5)
data.combined.new = res.impute2$completeObs
#The substr() calls below strip a "VarName_" style prefix off the imputed factor
#cells, keeping only the trailing value.
#NOTE(review): this decoding assumes labelled rows are 1:891 and test rows 892:1309,
#with a 1-character suffix on train ("0"/"1") and a 4-character suffix on test
#("None") — verify against the actual completeObs output.
data.combined.new$Survived = as.character(data.combined.new$Survived)
Survived1 = substr(data.combined.new$Survived,nchar(data.combined.new$Survived),nchar(data.combined.new$Survived)+1)[1:891]
Survived2 = substr(data.combined.new$Survived,nchar(data.combined.new$Survived)-3,nchar(data.combined.new$Survived))[892:1309]
Survivedt = c(Survived1, Survived2)
data.combined.new$Survived = as.factor(Survivedt)
#same single-character decoding for Pclass ("1"/"2"/"3")
data.combined.new$Pclass = as.character(data.combined.new$Pclass)
Classt = substr(data.combined.new$Pclass,nchar(data.combined.new$Pclass),nchar(data.combined.new$Pclass)+1)
data.combined.new$Pclass = as.factor(Classt)
####################################################################################
# OUTLIERS DETECTION AND FIXING
#Take Age5 and consider including it in the model
#(even though it does not seem to bring much predictive capacity).
boxplot.stats(data.combined$Age5)$out
#labelled rows with the Age5 outliers removed
data.combined3 <- data.combined[which(!(data.combined$Age5 %in% boxplot.stats(data.combined$Age5)$out) & data.combined$Survived != "None"), ]
rf.label.data.combined3 <- data.combined3$Survived
#NOTE(review): this measures MI on the raw Age (which still contains NAs), not the
#imputed Age5 used for the outlier filter — confirm which was intended.
mutinformation(discretize(data.combined3$Age), rf.label.data.combined3) #dropping the outliers doesn't help
#make a submission including Age5
features <- c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")
rpart.train.4 <- data.combined[1:891, features]
rpart.3.cv.2 <- rpart.cv(94622, rpart.train.4, rf.label.4, control1)
rpart.3.cv.2
#BUG FIX: plot/inspect the model just trained (rpart.3.cv.2), not the older rpart.3.cv.1
prp(rpart.3.cv.2$finalModel, extra = 1, under = TRUE)
rpart.3.cv.2$finalModel
test.submit.df5 <- data.combined[which(data.combined$Survived == "None"),
                                 c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")]
rpart.preds <- predict(rpart.3.cv.2$finalModel, test.submit.df5, type = "class")
table(rpart.preds)
#FIX: single-argument rep() is a no-op — use the plain id sequence
submit.df3 <- data.frame(PassengerID = 892:nrow(data.combined), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190116_2.csv", row.names = FALSE)
####################################################################################
# SCALING OF EACH VARIABLE
#Centre and scale all numeric variables used by the model, then re-fit,
#re-check and submit.
data.combined_BackUP <- data.combined
preProc <- preProcess(data.combined[, c("Number_People", "Avg_Fare", "Age5")],
                      method = c("center", "scale"))
data.combined[, c("Number_People", "Avg_Fare", "Age5")] <- predict(preProc, data.combined[, c("Number_People", "Avg_Fare", "Age5")])
features <- c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")
rpart.train.4 <- data.combined[1:891, features]
rpart.3.cv.2 <- rpart.cv(94622, rpart.train.4, rf.label.4, control1)
rpart.3.cv.2
#BUG FIX: inspect the model just re-trained on scaled data (rpart.3.cv.2),
#not the stale rpart.3.cv.1
prp(rpart.3.cv.2$finalModel, extra = 1, under = TRUE)
rpart.3.cv.2$finalModel
test.submit.df5 <- data.combined[which(data.combined$Survived == "None"),
                                 c("Pclass", "New_Title", "Number_People", "Avg_Fare", "Age5")]
rpart.preds <- predict(rpart.3.cv.2$finalModel, test.submit.df5, type = "class")
table(rpart.preds)
#FIX: single-argument rep() is a no-op — use the plain id sequence
submit.df3 <- data.frame(PassengerID = 892:nrow(data.combined), Survived = rpart.preds)
write.csv(submit.df3, "RF_20190116_3.csv", row.names = FALSE)
#(blog-export residue commented out — "Nessun commento: / Posta un commento" is a
# Blogger comment-form footer, not R code; left uncommented it breaks parsing)