Kaggle泰坦尼克号沉船生存预测,已经是数据挖掘界国际经典入门案例了。
那,小试“牛”刀。
#Titanic: Machine Learning from Disaster#
import pandas as pd
import numpy as np
# Load the Kaggle Titanic training set and preview its first five rows.
train1 = pd.read_csv(
    "D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/train.csv"
)
train1.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train1. describe( )
PassengerId
Survived
Pclass
Age
SibSp
Parch
Fare
count
891.000000
891.000000
891.000000
714.000000
891.000000
891.000000
891.000000
mean
446.000000
0.383838
2.308642
29.699118
0.523008
0.381594
32.204208
std
257.353842
0.486592
0.836071
14.526497
1.102743
0.806057
49.693429
min
1.000000
0.000000
1.000000
0.420000
0.000000
0.000000
0.000000
25%
223.500000
0.000000
2.000000
20.125000
0.000000
0.000000
7.910400
50%
446.000000
0.000000
3.000000
28.000000
0.000000
0.000000
14.454200
75%
668.500000
1.000000
3.000000
38.000000
1.000000
0.000000
31.000000
max
891.000000
1.000000
3.000000
80.000000
8.000000
6.000000
512.329200
# Summary (count, unique, top, freq) for the object-dtype columns.
train1. describe( include= "O" )
Name
Sex
Ticket
Cabin
Embarked
count
891
891
891
204
889
unique
891
2
681
147
3
top
Jensen, Mr. Hans Peder
male
347082
B96 B98
S
freq
1
577
7
4
644
# Print the column dtypes and non-null counts of the training frame.
train1. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# Flag, per column, whether at least one value is missing.
train1.isna().any()
PassengerId False
Survived False
Pclass False
Name False
Sex False
Age True
SibSp False
Parch False
Ticket False
Fare False
Cabin True
Embarked True
dtype: bool
# Count missing values per column, largest count first.
train1.isna().sum().sort_values(ascending=False)
Cabin 687
Age 177
Embarked 2
Fare 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
Survived 0
PassengerId 0
dtype: int64
# Show the rows whose embarkation port is missing.
train1.loc[train1["Embarked"].isnull()]
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
61
62
1
1
Icard, Miss. Amelie
female
38.0
0
0
113572
80.0
B28
NaN
829
830
1
1
Stone, Mrs. George Nelson (Martha Evelyn)
female
62.0
0
0
113572
80.0
B28
NaN
# Build train2 from train1 with missing Embarked values replaced by "S";
# the dict form restricts the fill to that one column.
train2 = train1.fillna(value={"Embarked": "S"})
train2.head(n=20)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
5
6
0
3
Moran, Mr. James
male
NaN
0
0
330877
8.4583
NaN
Q
6
7
0
1
McCarthy, Mr. Timothy J
male
54.0
0
0
17463
51.8625
E46
S
7
8
0
3
Palsson, Master. Gosta Leonard
male
2.0
3
1
349909
21.0750
NaN
S
8
9
1
3
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
female
27.0
0
2
347742
11.1333
NaN
S
9
10
1
2
Nasser, Mrs. Nicholas (Adele Achem)
female
14.0
1
0
237736
30.0708
NaN
C
10
11
1
3
Sandstrom, Miss. Marguerite Rut
female
4.0
1
1
PP 9549
16.7000
G6
S
11
12
1
1
Bonnell, Miss. Elizabeth
female
58.0
0
0
113783
26.5500
C103
S
12
13
0
3
Saundercock, Mr. William Henry
male
20.0
0
0
A/5. 2151
8.0500
NaN
S
13
14
0
3
Andersson, Mr. Anders Johan
male
39.0
1
5
347082
31.2750
NaN
S
14
15
0
3
Vestrom, Miss. Hulda Amanda Adolfina
female
14.0
0
0
350406
7.8542
NaN
S
15
16
1
2
Hewlett, Mrs. (Mary D Kingcome)
female
55.0
0
0
248706
16.0000
NaN
S
16
17
0
3
Rice, Master. Eugene
male
2.0
4
1
382652
29.1250
NaN
Q
17
18
1
2
Williams, Mr. Charles Eugene
male
NaN
0
0
244373
13.0000
NaN
S
18
19
0
3
Vander Planke, Mrs. Julius (Emelia Maria Vande…
female
31.0
1
0
345763
18.0000
NaN
S
19
20
1
3
Masselmani, Mrs. Fatima
female
NaN
0
0
2649
7.2250
NaN
C
# Confirm that no row is left with a missing embarkation port.
train2.loc[train2["Embarked"].isnull()]
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
# Re-count missing values per column after the Embarked fill, largest first.
train2.isna().sum().sort_values(ascending=False)
Cabin 687
Age 177
Embarked 0
Fare 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
Survived 0
PassengerId 0
dtype: int64
# Impute missing ages with the median age, restricted to the Age column.
# A bare scalar fillna() fills the NaNs of *every* column, which also wrote the
# median age (28) into the missing Cabin cells; the dict form avoids that.
# NOTE: any recorded output showing 28 in Cabin predates this fix; re-run to refresh.
train3 = train2.fillna({"Age": train2["Age"].median()})
train3.head(20)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
28
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
28
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
28
S
5
6
0
3
Moran, Mr. James
male
28.0
0
0
330877
8.4583
28
Q
6
7
0
1
McCarthy, Mr. Timothy J
male
54.0
0
0
17463
51.8625
E46
S
7
8
0
3
Palsson, Master. Gosta Leonard
male
2.0
3
1
349909
21.0750
28
S
8
9
1
3
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
female
27.0
0
2
347742
11.1333
28
S
9
10
1
2
Nasser, Mrs. Nicholas (Adele Achem)
female
14.0
1
0
237736
30.0708
28
C
10
11
1
3
Sandstrom, Miss. Marguerite Rut
female
4.0
1
1
PP 9549
16.7000
G6
S
11
12
1
1
Bonnell, Miss. Elizabeth
female
58.0
0
0
113783
26.5500
C103
S
12
13
0
3
Saundercock, Mr. William Henry
male
20.0
0
0
A/5. 2151
8.0500
28
S
13
14
0
3
Andersson, Mr. Anders Johan
male
39.0
1
5
347082
31.2750
28
S
14
15
0
3
Vestrom, Miss. Hulda Amanda Adolfina
female
14.0
0
0
350406
7.8542
28
S
15
16
1
2
Hewlett, Mrs. (Mary D Kingcome)
female
55.0
0
0
248706
16.0000
28
S
16
17
0
3
Rice, Master. Eugene
male
2.0
4
1
382652
29.1250
28
Q
17
18
1
2
Williams, Mr. Charles Eugene
male
28.0
0
0
244373
13.0000
28
S
18
19
0
3
Vander Planke, Mrs. Julius (Emelia Maria Vande…
female
31.0
1
0
345763
18.0000
28
S
19
20
1
3
Masselmani, Mrs. Fatima
female
28.0
0
0
2649
7.2250
28
C
# Count the missing values remaining in each column of train3.
train3.isna().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
# Replace missing ages in train2 with the column median; only Age is touched.
train2["Age"] = train2["Age"].fillna(value=train2["Age"].median())
# Re-count missing values per column, largest first.
train2.isna().sum().sort_values(ascending=False)
Cabin 687
Embarked 0
Fare 0
Ticket 0
Parch 0
SibSp 0
Age 0
Sex 0
Name 0
Pclass 0
Survived 0
PassengerId 0
dtype: int64
# Preview the first 20 rows of train2 after the Age fill.
train2. head( 20 )
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
5
6
0
3
Moran, Mr. James
male
28.0
0
0
330877
8.4583
NaN
Q
6
7
0
1
McCarthy, Mr. Timothy J
male
54.0
0
0
17463
51.8625
E46
S
7
8
0
3
Palsson, Master. Gosta Leonard
male
2.0
3
1
349909
21.0750
NaN
S
8
9
1
3
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
female
27.0
0
2
347742
11.1333
NaN
S
9
10
1
2
Nasser, Mrs. Nicholas (Adele Achem)
female
14.0
1
0
237736
30.0708
NaN
C
10
11
1
3
Sandstrom, Miss. Marguerite Rut
female
4.0
1
1
PP 9549
16.7000
G6
S
11
12
1
1
Bonnell, Miss. Elizabeth
female
58.0
0
0
113783
26.5500
C103
S
12
13
0
3
Saundercock, Mr. William Henry
male
20.0
0
0
A/5. 2151
8.0500
NaN
S
13
14
0
3
Andersson, Mr. Anders Johan
male
39.0
1
5
347082
31.2750
NaN
S
14
15
0
3
Vestrom, Miss. Hulda Amanda Adolfina
female
14.0
0
0
350406
7.8542
NaN
S
15
16
1
2
Hewlett, Mrs. (Mary D Kingcome)
female
55.0
0
0
248706
16.0000
NaN
S
16
17
0
3
Rice, Master. Eugene
male
2.0
4
1
382652
29.1250
NaN
Q
17
18
1
2
Williams, Mr. Charles Eugene
male
28.0
0
0
244373
13.0000
NaN
S
18
19
0
3
Vander Planke, Mrs. Julius (Emelia Maria Vande…
female
31.0
1
0
345763
18.0000
NaN
S
19
20
1
3
Masselmani, Mrs. Fatima
female
28.0
0
0
2649
7.2250
NaN
C
# Load the Kaggle Titanic test set and preview its first five rows.
test1 = pd.read_csv(
    "D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/test.csv"
)
test1.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7.0000
NaN
S
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
# Count missing values per column of the test set, largest first.
test1.isna().sum().sort_values(ascending=False)
Cabin 327
Age 86
Fare 1
Embarked 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
PassengerId 0
dtype: int64
# Show the test rows whose fare is missing.
test1.loc[test1["Fare"].isnull()]
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
152
1044
3
Storey, Mr. Thomas
male
60.5
0
0
3701
NaN
NaN
S
# Fill the missing fare with the mean fare of the passenger's own class.
# "Fare" must be selected before transform(): transforming the whole grouped
# frame returns a DataFrame, and assigning that to a single column stored
# PassengerId values in Fare (row 152 ended up with Fare == 1044, its PassengerId).
# NOTE: recorded outputs showing Fare 1044 / class means near 1100 predate this fix.
test1["Fare"] = test1.groupby("Pclass")["Fare"].transform(lambda x: x.fillna(x.mean()))
# Re-count missing values per column, largest first.
test1.isnull().sum().sort_values(ascending=False)
Cabin 327
Age 86
Embarked 0
Fare 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
PassengerId 0
dtype: int64
# Inspect selected fields of the row at index label 152 (the row whose fare was missing).
test1. loc[ 152 , [ "Pclass" , "Name" , "Age" , "Fare" ] ]
Pclass 3
Name Storey, Mr. Thomas
Age 60.5
Fare 1044
Name: 152, dtype: object
# Show every field of the row at index label 152.
test1. loc[ 152 ]
PassengerId 1044
Pclass 3
Name Storey, Mr. Thomas
Sex male
Age 60.5
SibSp 0
Parch 0
Ticket 3701
Fare 1044
Cabin NaN
Embarked S
Name: 152, dtype: object
# Mean of the Fare column within each passenger class.
# NOTE(review): recorded means near 1100 look like PassengerId averages rather
# than fares -- verify the Fare column was not overwritten before trusting them.
Fare_Pclass_mean = test1["Fare"].groupby(test1["Pclass"]).mean()
Fare_Pclass_mean
Pclass
1 1098.224299
2 1117.935484
3 1094.178899
Name: Fare, dtype: float64
# Read test.csv again into a fresh frame, test2 (presumably to discard the
# changes made to test1), and preview its first 35 rows.
test2 = pd.read_csv(
    "D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/test.csv"
)
test2.head(n=35)
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7.0000
NaN
S
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
5
897
3
Svensson, Mr. Johan Cervin
male
14.0
0
0
7538
9.2250
NaN
S
6
898
3
Connolly, Miss. Kate
female
30.0
0
0
330972
7.6292
NaN
Q
7
899
2
Caldwell, Mr. Albert Francis
male
26.0
1
1
248738
29.0000
NaN
S
8
900
3
Abrahim, Mrs. Joseph (Sophie Halaut Easu)
female
18.0
0
0
2657
7.2292
NaN
C
9
901
3
Davies, Mr. John Samuel
male
21.0
2
0
A/4 48871
24.1500
NaN
S
10
902
3
Ilieff, Mr. Ylio
male
NaN
0
0
349220
7.8958
NaN
S
11
903
1
Jones, Mr. Charles Cresson
male
46.0
0
0
694
26.0000
NaN
S
12
904
1
Snyder, Mrs. John Pillsbury (Nelle Stevenson)
female
23.0
1
0
21228
82.2667
B45
S
13
905
2
Howard, Mr. Benjamin
male
63.0
1
0
24065
26.0000
NaN
S
14
906
1
Chaffee, Mrs. Herbert Fuller (Carrie Constance…
female
47.0
1
0
W.E.P. 5734
61.1750
E31
S
15
907
2
del Carlo, Mrs. Sebastiano (Argenia Genovesi)
female
24.0
1
0
SC/PARIS 2167
27.7208
NaN
C
16
908
2
Keane, Mr. Daniel
male
35.0
0
0
233734
12.3500
NaN
Q
17
909
3
Assaf, Mr. Gerios
male
21.0
0
0
2692
7.2250
NaN
C
18
910
3
Ilmakangas, Miss. Ida Livija
female
27.0
1
0
STON/O2. 3101270
7.9250
NaN
S
19
911
3
Assaf Khalil, Mrs. Mariana (Miriam”)”
female
45.0
0
0
2696
7.2250
NaN
C
20
912
1
Rothschild, Mr. Martin
male
55.0
1
0
PC 17603
59.4000
NaN
C
21
913
3
Olsen, Master. Artur Karl
male
9.0
0
1
C 17368
3.1708
NaN
S
22
914
1
Flegenheim, Mrs. Alfred (Antoinette)
female
NaN
0
0
PC 17598
31.6833
NaN
S
23
915
1
Williams, Mr. Richard Norris II
male
21.0
0
1
PC 17597
61.3792
NaN
C
24
916
1
Ryerson, Mrs. Arthur Larned (Emily Maria Borie)
female
48.0
1
3
PC 17608
262.3750
B57 B59 B63 B66
C
25
917
3
Robins, Mr. Alexander A
male
50.0
1
0
A/5. 3337
14.5000
NaN
S
26
918
1
Ostby, Miss. Helene Ragnhild
female
22.0
0
1
113509
61.9792
B36
C
27
919
3
Daher, Mr. Shedid
male
22.5
0
0
2698
7.2250
NaN
C
28
920
1
Brady, Mr. John Bertram
male
41.0
0
0
113054
30.5000
A21
S
29
921
3
Samaan, Mr. Elias
male
NaN
2
0
2662
21.6792
NaN
C
30
922
2
Louch, Mr. Charles Alexander
male
50.0
1
0
SC/AH 3085
26.0000
NaN
S
31
923
2
Jefferys, Mr. Clifford Thomas
male
24.0
2
0
C.A. 31029
31.5000
NaN
S
32
924
3
Dean, Mrs. Bertram (Eva Georgetta Light)
female
33.0
1
2
C.A. 2315
20.5750
NaN
S
33
925
3
Johnston, Mrs. Andrew G (Elizabeth Lily” Watson)”
female
NaN
1
2
W./C. 6607
23.4500
NaN
S
34
926
1
Mock, Mr. Philipp Edmund
male
30.0
1
0
13236
57.7500
C78
C
# Count missing values per column of test2, largest first.
test2.isna().sum().sort_values(ascending=False)
Cabin 327
Age 86
Fare 1
Embarked 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
PassengerId 0
dtype: int64
# Impute the missing fare with the mean fare of that passenger's own class.
# The earlier literal "1094" was a *string* (it turned the numeric Fare column
# into object dtype) and its value is far above any real fare, so it would feed
# a wildly wrong Fare into any model that uses this column.
test2["Fare"] = test2["Fare"].fillna(
    test2.groupby("Pclass")["Fare"].transform("mean")
)
# Re-count missing values per column, largest first.
test2.isnull().sum().sort_values(ascending=False)
Cabin 327
Age 86
Embarked 0
Fare 0
Ticket 0
Parch 0
SibSp 0
Sex 0
Name 0
Pclass 0
PassengerId 0
dtype: int64
# Replace missing ages in test2 with the median age of test2 itself.
test2["Age"] = test2["Age"].fillna(value=test2["Age"].median())
# Re-count missing values per column, largest first.
test2.isna().sum().sort_values(ascending=False)
Cabin 327
Embarked 0
Fare 0
Ticket 0
Parch 0
SibSp 0
Age 0
Sex 0
Name 0
Pclass 0
PassengerId 0
dtype: int64
# Mean age in test2, computed after the median imputation.
Age_mean = test2.loc[:, "Age"].mean()
Age_mean
29.599282296650717
# Preview the first 35 rows of test2 after the Fare and Age fills.
test2. head( 35 )
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
5
897
3
Svensson, Mr. Johan Cervin
male
14.0
0
0
7538
9.225
NaN
S
6
898
3
Connolly, Miss. Kate
female
30.0
0
0
330972
7.6292
NaN
Q
7
899
2
Caldwell, Mr. Albert Francis
male
26.0
1
1
248738
29
NaN
S
8
900
3
Abrahim, Mrs. Joseph (Sophie Halaut Easu)
female
18.0
0
0
2657
7.2292
NaN
C
9
901
3
Davies, Mr. John Samuel
male
21.0
2
0
A/4 48871
24.15
NaN
S
10
902
3
Ilieff, Mr. Ylio
male
27.0
0
0
349220
7.8958
NaN
S
11
903
1
Jones, Mr. Charles Cresson
male
46.0
0
0
694
26
NaN
S
12
904
1
Snyder, Mrs. John Pillsbury (Nelle Stevenson)
female
23.0
1
0
21228
82.2667
B45
S
13
905
2
Howard, Mr. Benjamin
male
63.0
1
0
24065
26
NaN
S
14
906
1
Chaffee, Mrs. Herbert Fuller (Carrie Constance…
female
47.0
1
0
W.E.P. 5734
61.175
E31
S
15
907
2
del Carlo, Mrs. Sebastiano (Argenia Genovesi)
female
24.0
1
0
SC/PARIS 2167
27.7208
NaN
C
16
908
2
Keane, Mr. Daniel
male
35.0
0
0
233734
12.35
NaN
Q
17
909
3
Assaf, Mr. Gerios
male
21.0
0
0
2692
7.225
NaN
C
18
910
3
Ilmakangas, Miss. Ida Livija
female
27.0
1
0
STON/O2. 3101270
7.925
NaN
S
19
911
3
Assaf Khalil, Mrs. Mariana (Miriam”)”
female
45.0
0
0
2696
7.225
NaN
C
20
912
1
Rothschild, Mr. Martin
male
55.0
1
0
PC 17603
59.4
NaN
C
21
913
3
Olsen, Master. Artur Karl
male
9.0
0
1
C 17368
3.1708
NaN
S
22
914
1
Flegenheim, Mrs. Alfred (Antoinette)
female
27.0
0
0
PC 17598
31.6833
NaN
S
23
915
1
Williams, Mr. Richard Norris II
male
21.0
0
1
PC 17597
61.3792
NaN
C
24
916
1
Ryerson, Mrs. Arthur Larned (Emily Maria Borie)
female
48.0
1
3
PC 17608
262.375
B57 B59 B63 B66
C
25
917
3
Robins, Mr. Alexander A
male
50.0
1
0
A/5. 3337
14.5
NaN
S
26
918
1
Ostby, Miss. Helene Ragnhild
female
22.0
0
1
113509
61.9792
B36
C
27
919
3
Daher, Mr. Shedid
male
22.5
0
0
2698
7.225
NaN
C
28
920
1
Brady, Mr. John Bertram
male
41.0
0
0
113054
30.5
A21
S
29
921
3
Samaan, Mr. Elias
male
27.0
2
0
2662
21.6792
NaN
C
30
922
2
Louch, Mr. Charles Alexander
male
50.0
1
0
SC/AH 3085
26
NaN
S
31
923
2
Jefferys, Mr. Clifford Thomas
male
24.0
2
0
C.A. 31029
31.5
NaN
S
32
924
3
Dean, Mrs. Bertram (Eva Georgetta Light)
female
33.0
1
2
C.A. 2315
20.575
NaN
S
33
925
3
Johnston, Mrs. Andrew G (Elizabeth Lily” Watson)”
female
27.0
1
2
W./C. 6607
23.45
NaN
S
34
926
1
Mock, Mr. Philipp Edmund
male
30.0
1
0
13236
57.75
C78
C
# Pearson correlation matrix of the numeric raw columns.
# "Sex" and "Embarked" hold strings and are left out of the selection: older
# pandas silently dropped them from corr(), while pandas >= 2.0 raises on them.
train2[["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]].corr(method="pearson")
Survived
Pclass
Age
SibSp
Parch
Fare
Survived
1.000000
-0.338481
-0.064910
-0.035322
0.081629
0.257307
Pclass
-0.338481
1.000000
-0.339898
0.083081
0.018443
-0.549500
Age
-0.064910
-0.339898
1.000000
-0.233296
-0.172482
0.096688
SibSp
-0.035322
0.083081
-0.233296
1.000000
0.414838
0.159651
Parch
0.081629
0.018443
-0.172482
0.414838
1.000000
0.216225
Fare
0.257307
-0.549500
0.096688
0.159651
0.216225
1.000000
# Family = siblings/spouses aboard + parents/children aboard.
train2["Family"] = train2["SibSp"].add(train2["Parch"])
train2.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
1
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
0
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
1
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
0
# Age-band labels, youngest to oldest (child, teenager, young adult,
# middle-aged, late middle-aged, elderly).
labels = ['婴幼儿童', '少年', '青年', '中年', '中老年', '老年']
# Band edges; with right=False each band is left-closed: [-1, 12), [12, 19), ...
bins = [-1, 12, 19, 36, 53, 66, 150]
train2['age_group'] = pd.cut(x=train2['Age'], bins=bins, labels=labels, right=False)
train2.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
青年
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
1
中年
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
0
青年
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
1
青年
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
0
青年
# Encode each age band as an integer code, 1 through 6.
train2['age_group0'] = (
    train2['age_group']
    .map({
        '婴幼儿童': 1,
        '少年': 2,
        '青年': 3,
        '中年': 4,
        '中老年': 5,
        '老年': 6,
    })
    .astype(int)
)
train2.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
青年
3
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
1
中年
4
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
0
青年
3
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
1
青年
3
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
0
青年
3
# Encode sex as an integer: female -> 1, male -> 2.
train2['Sex0'] = (
    train2['Sex']
    .map({'female': 1, 'male': 2})
    .astype(int)
)
train2.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
Sex0
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
青年
3
2
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
1
中年
4
1
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
0
青年
3
1
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
1
青年
3
1
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
0
青年
3
2
# Count how many passengers boarded at each port.
train2[ 'Embarked' ] . value_counts( )
S 646
C 168
Q 77
Name: Embarked, dtype: int64
# Encode the embarkation port as an integer: S -> 1, C -> 2, Q -> 2.
# NOTE(review): C and Q share code 2, so the feature is effectively
# "S vs. not S" -- confirm this grouping is intended and not a typo for Q -> 3.
train2["Embarked0"] = (
    train2["Embarked"]
    .map({'S': 1, 'C': 2, 'Q': 2})
    .astype(int)
)
train2.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
Sex0
Embarked0
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
青年
3
2
1
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
1
中年
4
1
2
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
0
青年
3
1
1
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
1
青年
3
1
1
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
0
青年
3
2
1
# Pearson correlation matrix of the target and the numeric/encoded feature columns.
train2.loc[
    :,
    ["Survived", "Pclass", "Sex0", "age_group0", "Family", "Fare", "Embarked0"],
].corr(method="pearson")
Survived
Pclass
Sex0
age_group0
Family
Fare
Embarked0
Survived
1.000000
-0.338481
-0.543351
-0.086879
0.016639
0.257307
0.149683
Pclass
-0.338481
1.000000
0.131900
-0.308349
0.065997
-0.549500
-0.074053
Sex0
-0.543351
0.131900
1.000000
0.095705
-0.200988
-0.182333
-0.119224
age_group0
-0.086879
-0.308349
0.095705
1.000000
-0.293598
0.077438
0.002818
Family
0.016639
0.065997
-0.200988
-0.293598
1.000000
0.217138
-0.077359
Fare
0.257307
-0.549500
-0.182333
0.077438
0.217138
1.000000
0.162184
Embarked0
0.149683
-0.074053
-0.119224
0.002818
-0.077359
0.162184
1.000000
# Mean of Survived for each Family value, highest first.
(
    train2
    .groupby("Family", as_index=False)["Survived"]
    .mean()
    .sort_values(by="Survived", ascending=False)
)
Family
Survived
3
3
0.724138
2
2
0.578431
1
1
0.552795
6
6
0.333333
0
0
0.303538
4
4
0.200000
5
5
0.136364
7
7
0.000000
8
10
0.000000
# Mean of Survived for each age band, highest first.
train2[ [ "age_group" , "Survived" ] ] . groupby( "age_group" , as_index= False ) . mean( ) . sort_values( by= "Survived" , ascending= False )
age_group
Survived
0
婴幼儿童
0.573529
1
少年
0.436620
3
中年
0.397590
4
中老年
0.372093
2
青年
0.353271
5
老年
0.125000
# Family = siblings/spouses aboard + parents/children aboard (same feature as on train2).
test2["Family"] = test2["SibSp"].add(test2["Parch"])
test2.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
0
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
1
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
0
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
0
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
2
# Age-band labels, youngest to oldest (child, teenager, young adult,
# middle-aged, late middle-aged, elderly).
labels = ['婴幼儿童', '少年', '青年', '中年', '中老年', '老年']
# Band edges; with right=False each band is left-closed: [-1, 12), [12, 19), ...
bins = [-1, 12, 19, 36, 53, 66, 150]
test2['age_group'] = pd.cut(x=test2['Age'], bins=bins, labels=labels, right=False)
test2.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
0
青年
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
1
中年
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
0
中老年
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
0
青年
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
2
青年
# Encode each age band as an integer code, 1 through 6.
test2['age_group0'] = (
    test2['age_group']
    .map({
        '婴幼儿童': 1,
        '少年': 2,
        '青年': 3,
        '中年': 4,
        '中老年': 5,
        '老年': 6,
    })
    .astype(int)
)
test2.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
0
青年
3
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
1
中年
4
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
0
中老年
5
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
0
青年
3
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
2
青年
3
# Encode sex as an integer: female -> 1, male -> 2.
test2['Sex0'] = (
    test2['Sex']
    .map({'female': 1, 'male': 2})
    .astype(int)
)
test2.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
Sex0
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
0
青年
3
2
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
1
中年
4
1
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
0
中老年
5
2
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
0
青年
3
2
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
2
青年
3
1
# Encode the embarkation port as an integer: S -> 1, C -> 2, Q -> 2.
# NOTE(review): C and Q share code 2 -- confirm this grouping is intended.
test2["Embarked0"] = (
    test2["Embarked"]
    .map({'S': 1, 'C': 2, 'Q': 2})
    .astype(int)
)
test2.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Family
age_group
age_group0
Sex0
Embarked0
0
892
3
Kelly, Mr. James
male
34.5
0
0
330911
7.8292
NaN
Q
0
青年
3
2
2
1
893
3
Wilkes, Mrs. James (Ellen Needs)
female
47.0
1
0
363272
7
NaN
S
1
中年
4
1
1
2
894
2
Myles, Mr. Thomas Francis
male
62.0
0
0
240276
9.6875
NaN
Q
0
中老年
5
2
2
3
895
3
Wirz, Mr. Albert
male
27.0
0
0
315154
8.6625
NaN
S
0
青年
3
2
1
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
female
22.0
1
1
3101298
12.2875
NaN
S
2
青年
3
1
1
from sklearn.linear_model import LogisticRegression

# Model 1: logistic regression on six feature columns.
x_train = train2.loc[:, ["Pclass", "Fare", "Family", "age_group0", "Sex0", "Embarked0"]]
y_train = train2.loc[:, "Survived"]
x_test = test2.loc[:, ["Pclass", "Fare", "Family", "age_group0", "Sex0", "Embarked0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train, y_train)
Y1_prediction = Classifier1.predict(x_test)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train, y_train)
score_Logit
0.8047138047138047
# Show the fitted coefficients of the current classifier.
Classifier1. coef_
array([[-0.74124862, 0.00496064, -0.17449562, -0.33260786, -2.28141631,
0.58615422]])
# Submission table: PassengerId from the test set plus the predicted Survived label.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.head(n=10)
PassengerId
Survived
0
892
0
1
893
0
2
894
0
3
895
0
4
896
0
5
897
0
6
898
1
7
899
0
8
900
1
9
901
0
# Write the current submission table to Final1.csv, without the index column.
Final. to_csv( r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final1.csv" , index= False )
from sklearn.linear_model import LogisticRegression

# Model 2: logistic regression on five feature columns (no Fare).
x_train1 = train2.loc[:, ["Pclass", "Family", "age_group0", "Sex0", "Embarked0"]]
y_train1 = train2.loc[:, "Survived"]
x_test1 = test2.loc[:, ["Pclass", "Family", "age_group0", "Sex0", "Embarked0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train1, y_train1)
Y1_prediction = Classifier1.predict(x_test1)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train1, y_train1)
score_Logit
0.7991021324354658
# Submission table for the current predictions, written to Final2.csv without the index.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.to_csv(
    r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final2.csv",
    index=False,
)
from sklearn.linear_model import LogisticRegression

# Model 3: logistic regression on four feature columns (no Fare, no Embarked0).
x_train2 = train2.loc[:, ["Pclass", "Family", "age_group0", "Sex0"]]
y_train2 = train2.loc[:, "Survived"]
x_test2 = test2.loc[:, ["Pclass", "Family", "age_group0", "Sex0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train2, y_train2)
Y1_prediction = Classifier1.predict(x_test2)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train2, y_train2)
score_Logit
0.8058361391694725
# Submission table for the current predictions, written to Final3.csv without the index.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.to_csv(
    r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final3.csv",
    index=False,
)
from sklearn.linear_model import LogisticRegression

# Model 4: logistic regression on Pclass, Family and Sex0.
x_train3 = train2.loc[:, ["Pclass", "Family", "Sex0"]]
y_train3 = train2.loc[:, "Survived"]
x_test3 = test2.loc[:, ["Pclass", "Family", "Sex0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train3, y_train3)
Y1_prediction = Classifier1.predict(x_test3)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train3, y_train3)
score_Logit
0.8002244668911336
# Submission table for the current predictions, written to Final4.csv without the index.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.to_csv(
    r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final4.csv",
    index=False,
)
from sklearn.linear_model import LogisticRegression

# Model 5: logistic regression on Pclass, age_group0 and Sex0.
x_train4 = train2.loc[:, ["Pclass", "age_group0", "Sex0"]]
y_train4 = train2.loc[:, "Survived"]
x_test4 = test2.loc[:, ["Pclass", "age_group0", "Sex0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train4, y_train4)
Y1_prediction = Classifier1.predict(x_test4)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train4, y_train4)
score_Logit
0.8002244668911336
# Submission table for the current predictions, written to Final5.csv without the index.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.to_csv(
    r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final5.csv",
    index=False,
)
from sklearn.linear_model import LogisticRegression

# Model 6: logistic regression on Pclass and Sex0 only.
x_train5 = train2.loc[:, ["Pclass", "Sex0"]]
y_train5 = train2.loc[:, "Survived"]
x_test5 = test2.loc[:, ["Pclass", "Sex0"]]

# fit() returns the estimator itself, so construction and fitting can be chained.
Classifier1 = LogisticRegression().fit(x_train5, y_train5)
Y1_prediction = Classifier1.predict(x_test5)

# Score on the same rows the model was fitted on (not a held-out estimate).
score_Logit = Classifier1.score(x_train5, y_train5)
score_Logit
0.7867564534231201
# Submission table for the current predictions, written to Final6.csv without the index.
Final = test2[["PassengerId"]].assign(Survived=Y1_prediction)
Final.to_csv(
    r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/Final6.csv",
    index=False,
)
# Save the processed training and test frames (including the added feature
# columns) to CSV, without the index column.
train2. to_csv( r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/train2.csv" , index= False )
test2. to_csv( r"D:/2018_BigData/Python/Kaggle_learning/Titanic Machine Learning from Disaster/titanic/test2.csv" , index= False )