Titanic Data Family Size and Chance of Survival Logistic Regression

Predicting the Survival of Titanic Passengers

Niklas Donges

RMS Titanic

Importing the Libraries

# linear algebra
import numpy as np

# data processing
import pandas as pd

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

Getting the Data

          test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

Data Exploration/Analysis

          train_df.info()        

survival:    Survival
PassengerId: Unique Id of a passenger
pclass:      Ticket class
sex:         Sex
Age:         Age in years
sibsp:       # of siblings / spouses aboard the Titanic
parch:       # of parents / children aboard the Titanic
ticket:      Ticket number
fare:        Passenger fare
cabin:       Cabin number
embarked:    Port of Embarkation

train_df.describe()

train_df.head(8)

total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

          train_df.columns.values        

survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
women = train_df[train_df['Sex']=='female']
men = train_df[train_df['Sex']=='male']
ax = sns.distplot(women[women['Survived']==1].Age.dropna(), bins=18, label=survived, ax=axes[0], kde=False)
ax = sns.distplot(women[women['Survived']==0].Age.dropna(), bins=40, label=not_survived, ax=axes[0], kde=False)
ax.legend()
ax.set_title('Female')
ax = sns.distplot(men[men['Survived']==1].Age.dropna(), bins=18, label=survived, ax=axes[1], kde=False)
ax = sns.distplot(men[men['Survived']==0].Age.dropna(), bins=40, label=not_survived, ax=axes[1], kde=False)
ax.legend()
_ = ax.set_title('Male')

FacetGrid = sns.FacetGrid(train_df, row='Embarked', size=4.5, aspect=1.6)
FacetGrid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette=None, order=None, hue_order=None)
FacetGrid.add_legend()

sns.barplot(x='Pclass', y='Survived', data=train_df)

grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

data = [train_df, test_df]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)
train_df['not_alone'].value_counts()

          axes = sns.factorplot('relatives','Survived',            
data=train_df, aspect = 2.5, )
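As a numeric complement to the plot above, one can also look at the raw survival rate per family size. A minimal sketch, reusing the 'relatives' column created above:

survival_by_relatives = train_df.groupby('relatives')['Survived'].mean()
print(survival_by_relatives)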

Data Preprocessing

          train_df = train_df.drop(['PassengerId'], axis=1)        

Missing Data:

import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]

for dataset in data:
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = dataset['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    dataset['Deck'] = dataset['Deck'].map(deck)
    dataset['Deck'] = dataset['Deck'].fillna(0)
    dataset['Deck'] = dataset['Deck'].astype(int)

# we can now drop the cabin feature
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)

data = [train_df, test_df]

for dataset in data:
    mean = train_df["Age"].mean()
    std = test_df["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # compute random numbers between mean - std and mean + std
    rand_age = np.random.randint(mean - std, mean + std, size=is_null)
    # fill NaN values in the Age column with the random values generated
    age_slice = dataset["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    dataset["Age"] = age_slice
    dataset["Age"] = dataset["Age"].astype(int)

train_df["Age"].isnull().sum()

train_df['Embarked'].describe()

common_value = 'S'
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value)

Converting Features:

          train_df.info()        

data = [train_df, test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0)
    dataset['Fare'] = dataset['Fare'].astype(int)

data = [train_df, test_df]
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

for dataset in data:
    # extract titles
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    # replace rare titles with a more common title or with Rare
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                                                 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    # convert titles into numbers
    dataset['Title'] = dataset['Title'].map(titles)
    # fill NaN with 0 to be safe
    dataset['Title'] = dataset['Title'].fillna(0)

train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

genders = {"male": 0, "female": 1}
data = [train_df, test_df]

for dataset in data:
    dataset['Sex'] = dataset['Sex'].map(genders)

          train_df['Ticket'].describe()        

train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)

ports = {"S": 0, "C": 1, "Q": 2}
data = [train_df, test_df]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

Creating Categories:

data = [train_df, test_df]
for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
    dataset.loc[ dataset['Age'] > 66, 'Age'] = 6

# let's see how it's distributed
train_df['Age'].value_counts()

train_df.head(10)

data = [train_df, test_df]

for dataset in data:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare'] = 3
    dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare'] = 4
    dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)

Creating new Features

data = [train_df, test_df]
for dataset in data:
    dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']

for dataset in data:
    dataset['Fare_Per_Person'] = dataset['Fare'] / (dataset['relatives'] + 1)
    dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)

# Let's take a last look at the training set before we start training the models.
train_df.head(10)

Building Machine Learning Models

X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()

sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

sgd.score(X_train, Y_train)

acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

          random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

          logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
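Since logistic regression is a linear model, its fitted coefficients give a rough sense of how each feature pushes the predicted survival probability up or down. A minimal, optional sketch, assuming the logreg model and X_train frame fitted above (not part of the original pipeline):

coefficients = pd.DataFrame({'feature': X_train.columns,
                             'coefficient': logreg.coef_[0]})
print(coefficients.sort_values('coefficient', ascending=False))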

# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)

Y_pred = gaussian.predict(X_test)

acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
perceptron = Perceptron(max_iter=5)
perceptron.fit(X_train, Y_train)

Y_pred = perceptron.predict(X_test)

acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)

Y_pred = linear_svc.predict(X_test)

acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)

Y_pred = decision_tree.predict(X_test)

acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

Which is the best Model?

results = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent',
              'Decision Tree'],
    'Score': [acc_linear_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_decision_tree]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

K-Fold Cross Validation:

from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Random Forest

What is Random Forest?
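In short, a random forest is an ensemble of decision trees: each tree is trained on a bootstrap sample of the training data and considers only a random subset of the features at each split, and the forest combines the individual trees by majority vote. A minimal illustrative sketch of that idea, assuming the X_train and Y_train frames built above (the actual model in this notebook uses sklearn's RandomForestClassifier):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

n_trees = 10
trees = []
for i in range(n_trees):
    # draw a bootstrap sample: rows sampled with replacement
    idx = np.random.randint(0, len(X_train), len(X_train))
    tree = DecisionTreeClassifier(max_features='sqrt', random_state=i)
    tree.fit(X_train.iloc[idx], Y_train.iloc[idx])
    trees.append(tree)

# each tree votes; the ensemble prediction is the majority vote
votes = np.mean([tree.predict(X_train) for tree in trees], axis=0)
ensemble_prediction = (votes >= 0.5).astype(int)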

Feature Importance

importances = pd.DataFrame({'feature': X_train.columns, 'importance': np.round(random_forest.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(15)

          importances.plot.bar()        

Conclusion:

train_df = train_df.drop("not_alone", axis=1)
test_df = test_df.drop("not_alone", axis=1)

train_df = train_df.drop("Parch", axis=1)
test_df = test_df.drop("Parch", axis=1)

# Random Forest

random_forest = RandomForestClassifier(n_estimators=100, oob_score=True)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(round(acc_random_forest, 2), "%")

          print("oob score:", round(random_forest.oob_score_, four)*100, "%")        

Hyperparameter Tuning

param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_leaf": [1, 5, 10, 25, 50, 70],
              "min_samples_split": [2, 4, 10, 12, 16, 18, 25, 35],
              "n_estimators": [100, 400, 700, 1000, 1500]}

from sklearn.model_selection import GridSearchCV, cross_val_score

rf = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)

clf.fit(X_train, Y_train)

clf.best_params_
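Once the search finishes, clf.best_params_ holds the best parameter combination found (used in the block below), and clf.best_score_ reports its cross-validated accuracy.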

# Random Forest
random_forest = RandomForestClassifier(criterion = "gini",
min_samples_leaf = 1,
min_samples_split = 10,
n_estimators=100,
max_features='auto',
oob_score=True,
random_state=1,
n_jobs=-1)

random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

impress("oob score:", round(random_forest.oob_score_, iv)*100, "%")

Further Evaluation

Confusion Matrix:

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)
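In sklearn's convention, the matrix rows are the actual classes and the columns the predicted ones. One way to unpack it into named counts, as a small sketch reusing Y_train and predictions from above:

tn, fp, fn, tp = confusion_matrix(Y_train, predictions).ravel()
print("true negatives:", tn, " false positives:", fp)
print("false negatives:", fn, " true positives:", tp)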

Precision and Recall:

from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(Y_train, predictions))
print("Recall:", recall_score(Y_train, predictions))

F-Score

from sklearn.metrics import f1_score
f1_score(Y_train, predictions)
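The F-score is the harmonic mean of precision and recall: F1 = 2 * precision * recall / (precision + recall). A quick check against the values computed above (a sketch reusing the same predictions):

p = precision_score(Y_train, predictions)
r = recall_score(Y_train, predictions)
print("F1 computed by hand:", 2 * p * r / (p + r))
print("F1 from sklearn:   ", f1_score(Y_train, predictions))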

Precision Recall Curve

from sklearn.metrics import precision_recall_curve

# getting the probabilities of our predictions
y_scores = random_forest.predict_proba(X_train)
y_scores = y_scores[:, 1]

precision, recall, threshold = precision_recall_curve(Y_train, y_scores)

def plot_precision_and_recall(precision, recall, threshold):
    plt.plot(threshold, precision[:-1], "r-", label="precision", linewidth=5)
    plt.plot(threshold, recall[:-1], "b", label="recall", linewidth=5)
    plt.xlabel("threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])

plt.figure(figsize=(14, 7))
plot_precision_and_recall(precision, recall, threshold)
plt.show()
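The curve above shows how precision and recall trade off as the decision threshold moves away from the default 0.5. A minimal sketch of applying a custom threshold (the value 0.6 is only an arbitrary example, not a recommendation from the original analysis):

custom_threshold = 0.6
custom_pred = (y_scores > custom_threshold).astype(int)
print("Precision:", precision_score(Y_train, custom_pred))
print("Recall:   ", recall_score(Y_train, custom_pred))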

def plot_precision_vs_recall(precision, recall):
    plt.plot(recall, precision, "g--", linewidth=2.5)
    plt.ylabel("precision", fontsize=19)
    plt.xlabel("recall", fontsize=19)
    plt.axis([0, 1.5, 0, 1.5])

plt.figure(figsize=(14, 7))
plot_precision_vs_recall(precision, recall)
plt.show()

ROC AUC Curve

from sklearn.metrics import roc_curve
# compute true positive rate and false positive rate
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, y_scores)
# plotting them against each other
def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'r', linewidth=4)
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)

plt.figure(figsize=(14, 7))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()

ROC AUC Score

from sklearn.metrics import roc_auc_score
r_a_score = roc_auc_score(Y_train, y_scores)
print("ROC-AUC-Score:", r_a_score)

Summary


Source: https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8
