Titanic Notebook

# import library
# Import numpy and pandas and name them np, pd
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first three rows to see which columns are present.
print(df.head(n=3))
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
# Reload the file with descriptive column labels:
#   names     -> replacement column labels
#   header=0  -> the file's first line is a header row; it is replaced by `names`
#   index_col -> use the passenger ID column as the row index
column_labels = [
    'passenger ID', 'survived', 'Position level', 'Name', 'gender', 'age',
    'number of siblings', 'Number of parents and children',
    'Ticket information', 'fare', 'cabin', 'boarding port',
]
df = pd.read_csv('train.csv', header=0, names=column_labels, index_col='passenger ID')

# Preview the first three rows under the new labels.
print(df.head(n=3))
      Survive or not Position level                                                 Name      gender  \
passenger ID                                                                          
1        0     3                            Braund, Mr. Owen Harris    male   
2        1     1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
3        1     3                             Heikkinen, Miss. Laina  female   

        Age Number of siblings Number of parents and children              Ticket information       Fare Cabin Port of Embarkation  
passenger ID                                                             
1     22.0       1       0         A/5 21171   7.2500  NaN    S  
2     38.0       1       0          PC 17599  71.2833  C85    C  
3     26.0       0       0  STON/O2. 3101282   7.9250  NaN    S  
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
# DataFrame.info() writes the report itself and returns None, so it is called
# directly -- wrapping it in print() only appends a stray "None" line.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   survived    891 non-null    int64  
 1   Position level    891 non-null    int64  
 2   Name      891 non-null    object 
 3   gender      891 non-null    object 
 4   age      714 non-null    float64
 5   Number of siblings 891 non-null    int64  
 6   Number of parents and children 891 non-null    int64  
 7   Ticket information    891 non-null    object 
 8   fare      891 non-null    float64
 9   cabin      204 non-null    object 
 10  boarding port    889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
# Show the first ten rows and then the last ten rows.
for preview in (df.head(10), df.tail(10)):
    print(preview)
      Survive or not Position level                                                 Name      gender  \
passenger ID                                                                          
1        0     3                            Braund, Mr. Owen Harris    male   
2        1     1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
3        1     3                             Heikkinen, Miss. Laina  female   
4        1     1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
5        0     3                           Allen, Mr. William Henry    male   
6        0     3                                   Moran, Mr. James    male   
7        0     1                            McCarthy, Mr. Timothy J    male   
8        0     3                     Palsson, Master. Gosta Leonard    male   
9        1     3  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
10       1     2                Nasser, Mrs. Nicholas (Adele Achem)  female   

        Age Number of siblings Number of parents and children              Ticket information       fare    Cabin Port of Embarkation  
passenger ID                                                              
1     22.0       1       0         A/5 21171   7.2500   NaN    S  
2     38.0       1       0          PC 17599  71.2833   C85    C  
3     26.0       0       0  STON/O2. 3101282   7.9250   NaN    S  
4     35.0       1       0            113803  53.1000  C123    S  
5     35.0       0       0            373450   8.0500   NaN    S  
6      NaN       0       0            330877   8.4583   NaN    Q  
7     54.0       0       0             17463  51.8625   E46    S  
8      2.0       3       1            349909  21.0750   NaN    S  
9     27.0       0       2            347742  11.1333   NaN    S  
10    14.0       1       0            237736  30.0708   NaN    C  
      Survive or not Position level                                        Name      gender    age  \
passenger ID                                                                       
882      0     3                        Markun, Mr. Johann    male  33.0   
883      0     3              Dahlberg, Miss. Gerda Ulrika  female  22.0   
884      0     2             Banfield, Mr. Frederick James    male  28.0   
885      0     3                    Sutehall, Mr. Henry Jr    male  25.0   
886      0     3      Rice, Mrs. William (Margaret Norton)  female  39.0   
887      0     2                     Montvila, Rev. Juozas    male  27.0   
888      1     1              Graham, Miss. Margaret Edith  female  19.0   
889      0     3  Johnston, Miss. Catherine Helen "Carrie"  female   NaN   
890      1     1                     Behr, Mr. Karl Howell    male  26.0   
891      0     3                       Dooley, Mr. Patrick    male  32.0   

      Number of siblings Number of parents and children              Ticket information       fare    Cabin Port of Embarkation  
passenger ID                                                        
882        0       0            349257   7.8958   NaN    S  
883        0       0              7552  10.5167   NaN    S  
884        0       0  C.A./SOTON 34068  10.5000   NaN    S  
885        0       0   SOTON/OQ 392076   7.0500   NaN    S  
886        0       5            382652  29.1250   NaN    Q  
887        0       0            211536  13.0000   NaN    S  
888        0       0            112053  30.0000   B42    S  
889        1       2        W./C. 6607  23.4500   NaN    S  
890        0       0            111369  30.0000  C148    C  
891        0       0            370376   7.7500   NaN    Q  
# Element-wise missing-value mask: True where a cell is missing, False otherwise.
df.isna()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding port
Passenger ID
1FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
4FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
5FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
....................................
887FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse
888FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
889FalseFalseFalseFalseTrueFalseFalseFalseFalseTrueFalse
890FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
891FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalse

891 rows × 11 columns

# Persist the relabelled frame (index included) as CSV in the working directory.
# NOTE(review): 'tain_cn.csv' looks like a typo for 'train_cn.csv' -- confirm before renaming.
df.to_csv(path_or_buf='tain_cn.csv')
# Ten rows with the highest fare, ties broken by highest age (both keys descending).
print(df.sort_values(by=['fare', 'age'], ascending=False).head(10))

# Observation: a higher fare generally means a better cabin, and 8 of these 10
# top-fare passengers survived -- a high proportion. Worth following up: the
# relationship between fare and age, and between fare and survival rate.
# (Written as comments rather than a bare string literal, which the notebook
# would echo back as cell output.)

      Survive or not Position level                                               Name      gender  \
passenger ID                                                                        
680      1     1               Cardeza, Mr. Thomas Drake Martinez    male   
259      1     1                                 Ward, Miss. Anna  female   
738      1     1                           Lesurer, Mr. Gustave J    male   
439      0     1                                Fortune, Mr. Mark    male   
342      1     1                   Fortune, Miss. Alice Elizabeth  female   
89       1     1                       Fortune, Miss. Mabel Helen  female   
28       0     1                   Fortune, Mr. Charles Alexander    male   
743      1     1            Ryerson, Miss. Susan Parker "Suzette"  female   
312      1     1                       Ryerson, Miss. Emily Borie  female   
300      1     1  Baxter, Mrs. James (Helene DeLaudeniere Chaput)  female   

        Age Number of siblings Number of parents and children      Ticket information        fare               Cabin Port of Embarkation  
passenger ID                                                                  
680   36.0       0       1  PC 177 55  512.3292      B51 B53 B55    C  
259   35.0       0       0  PC 17755  512.3292              NaN    C  
738   35.0       0       0  PC 17755  512.3292             B101    C  
439   64.0       1       4     19950  263.0000      C23 C25 C27    S  
342   24.0       3       2     19950  263.0000      C23 C25 C27    S  
89    23.0       3       2     19950  263.0000      C23 C25 C27    S  
28    19.0       3       2     19950  263.0000      C23 C25 C27    S  
743   21.0       2       2  PC 17608  262.3750  B57 B59 B63 B66    C  
312   18.0       2       2  PC 17608  262.3750  B57 B59 B63 B66    C  
300   50.0       0       1  PC 17558  247.5208          B58 B60    C  





'\n According to common sense, I know that the higher the fare, the better the cabin,\n So we can clearly see that 8 of the top 10 passengers survived,\n This is a fairly high proportion. Later, you can try to analyze the relationship between ticket price and age, and the relationship between ticket price and survival rate.\n'
# Descriptive statistics for the fare column.
print(df['fare'].describe())

# Reading the summary:
#   - 891 fare values in total
#   - the mean is about 32.20
#   - the standard deviation is about 49.69, so fares vary widely
#   - 25% of fares are at or below 7.91, 50% at or below 14.45, 75% at or below 31.00
#   - the maximum is about 512.33 and the minimum is 0
# (Written as comments rather than a bare string literal, which the notebook
# would echo back as cell output.)
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64





'\n There are a total of 891 fare data,\n Average is about: 32.20,\n The standard deviation is about 49.69,It shows that the price fluctuates greatly.\n25%of people whose fare is less than 7.91 yes, 50%of people with fares below 14.45,75%of people with fares below 31.00,\n The maximum fare is about 512.33,The minimum value is 0.\n'
# Sort by cabin class and then by the survival flag (both descending) and look
# at both ends of the result. The sort is done once and reused.
by_class_and_survival = df.sort_values(by=['Position level', 'survived'], ascending=False)
print(by_class_and_survival.head(10))
print(by_class_and_survival.tail(10))

# Caution when reading this output: 'survived' is itself a sort key, so the head
# lists class-3 survivors first and the tail lists class-1 non-survivors last.
# The ten rows at each end therefore say nothing about survival rates per class;
# compare e.g. df.groupby('Position level')['survived'].mean() to explore the
# relationship between class and survival.
#
# More sorted views (age, gender, ...) and their descriptive statistics can be
# inspected in the same way.
      Survive or not Position level                                                 Name      gender  \
passenger ID                                                                          
3        1     3                             Heikkinen, Miss. Laina  female   
9        1     3  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
11       1     3                    Sandstrom, Miss. Marguerite Rut  female   
20       1     3                            Masselmani, Mrs. Fatima  female   
23       1     3                        McGowan, Miss. Anna "Annie"  female   
26       1     3  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...  female   
29       1     3                      O'Dwyer, Miss. Ellen "Nellie"  female   
33       1     3                           Glynn, Miss. Mary Agatha  female   
37       1     3                                   Mamee, Mr. Hanna    male   
40       1     3                        Nicola-Yarred, Miss. Jamila  female   

        Age Number of siblings Number of parents and children              Ticket information       Fare Cabin Port of Embarkation  
passenger ID                                                             
3     26.0       0       0  STON/O2. 3101282   7.9250  NaN    S  
9     27.0       0       2            347742  11.1333  NaN    S  
11     4.0       1       1           PP 9549  16.7000   G6    S  
20     NaN       0       0              2649   7.2250  NaN    C  
23    15.0       0       0            330923   8.0292  NaN    Q  
26    38.0       1       5            347077  31.3875  NaN    S  
29     NaN       0       0            330959   7.8792  NaN    Q  
33     NaN       0       0            335677   7.7500  NaN    Q  
37     NaN       0       0              2677   7.2292  NaN    C  
40    14.0       1       0              2651  11.2417  NaN    C  
      Survive or not Position level                                    Name    gender    Age Number of siblings  \
passenger ID                                                                         
749      0     1             Marvin, Mr. Daniel Warner  male  19.0       1   
767      0     1             Brewe, Dr. Arthur Jackson  male   NaN       0   
783      0     1                Long, Mr. Milton Clyde  male  29.0       0   
790      0     1              Guggenheim, Mr. Benjamin  male  46.0       0   
794      0     1              Hoyt, Mr. William Fisher  male   NaN       0   
807      0     1                Andrews, Mr. Thomas Jr  male  39.0       0   
816      0     1                      Fry, Mr. Richard  male   NaN       0   
823      0     1       Reuchlin, Jonkheer. John George  male  38.0       0   
868      0     1  Roebling, Mr. Washington Augustus II  male  31.0       0   
873      0     1              Carlsson, Mr. Frans Olof  male  33.0       0   

      Number of parents and children      Ticket information       fare           Cabin Port of Embarkation  
passenger ID                                               
749        0    113773  53.1000          D30    S  
767        0    112379  39.6000          NaN    C  
783        0    113501  30.0000           D6    S  
790        0  PC 17593  79.2000      B82 B84    C  
794        0  PC 17600  30.6958          NaN    C  
807        0    112050   0.0000          A36    S  
816        0    112058   0.0000         B102    S  
823        0     19972   0.0000          NaN    S  
868        0  PC 17590  50.4958          A24    S  
873        0       695   5.0000  B51 B53 B55    S  





'\n Through observation, it can be found that the top 10 positions with the highest level (level 3) all survived.\n The last 10 positions with the lowest level (level 1) were not spared\n Later, you can try to explore the relationship between position level and survival\n'
# Missing-value inspection.

# Method 1: info() reports the non-null count per column. It prints the report
# itself and returns None, so it is not wrapped in print().
df.info()

# Method 2: count the missing entries in each column.
print(df.isnull().sum())

# Observation: age, cabin and boarding port contain missing values.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   survived    891 non-null    int64  
 1   Position level    891 non-null    int64  
 2   Name      891 non-null    object 
 3   gender      891 non-null    object 
 4   age      714 non-null    float64
 5   Number of siblings 891 non-null    int64  
 6   Number of parents and children 891 non-null    int64  
 7   Ticket information    891 non-null    object 
 8   fare      891 non-null    float64
 9   cabin      204 non-null    object 
 10  boarding port    889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
 survived        0
 Bin level        0
 Name          0
 Gender          0
 age        177
 Number of siblings      0
 Number of parents and children      0
 Ticket information        0
 Ticket Price          0
 cabin        687
 boarding port        2
dtype: int64





'\n By observation, it can be found that there are missing values ​​for age, cabin, and port of embarkation\n'
# Options for dealing with missing values (kept for reference, not executed):
#
#   Drop every row that contains at least one missing value:
#       df.dropna()
#
#   Overwrite rows whose age is missing with 0:
#       df[df['age'].isna()] = 0
#   NOTE(review): with a boolean row mask this assigns 0 to every column of the
#   matching rows, not only to 'age'.
df.head(n=5)
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding portage categorygender category
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS21
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C52
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS32
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S42
503Allen, Mr. William Henrymale35.0003734508.0500NaNS41
# fillna(0) returns a copy in which every missing entry -- in any column, not
# only the numeric ones -- is replaced by 0. The result is only displayed here;
# df itself is left unchanged because the return value is not assigned.
df.fillna(value=0)
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding port
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.25000S
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.92500S
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.05000S
....................................
88702Montvila, Rev. Juozasmale27.00021153613.00000S
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88900000.00000.000000
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89103Dooley, Mr. Patrickmale32.0003703767.75000Q

891 rows × 11 columns

# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_mask = df.duplicated()
df[duplicate_mask]

survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding port
Passenger ID
1800000.00000.000
2000000.00000.000
2700000.00000.000
2900000.00000.000
3000000.00000.000
....................................
86000000.00000.000
86400000.00000.000
86900000.00000.000
87900000.00000.000
88900000.00000.000

176 rows × 11 columns

# Keep the first occurrence of each duplicated row, then re-check the summary.
df = df.drop_duplicates(keep='first')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   survived    715 non-null    int64  
 1   Position level    715 non-null    int64  
 2   Name      715 non-null    object 
 3   gender      715 non-null    object 
 4   age      715 non-null    float64
 5   Number of siblings 715 non-null    int64  
 6   Number of parents and children 715 non-null    int64  
 7   Ticket information    715 non-null    object 
 8   fare      715 non-null    float64
 9   cabin      186 non-null    object 
 10  boarding port    713 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 67.0+ KB
# Feature observation and processing
#
# The features fall roughly into two groups.
#
# Numerical features: Survived, Pclass, Age, SibSp, Parch, Fare
#   - Survived and Pclass are discrete numerical features.
#   - Age, SibSp, Parch and Fare are treated as continuous numerical features.
#
# Text features: Name, Sex, Cabin, Embarked, Ticket
#   - Sex, Cabin, Embarked and Ticket are categorical text features.
#
# Numerical features can generally be used for model training as they are,
# although continuous variables are sometimes discretised to make the model
# more stable and robust. Text features usually have to be converted into
# numerical features before they can be used for modelling and analysis.
#
# (Written as comments rather than a bare string literal, which the notebook
# would echo back as cell output.)
'\n When we observe the features, we can roughly divide the features into two categories.:   \n Numerical features: Survived ,Pclass, Age ,SibSp, Parch, Fare,\n in Survived, Pclass is a discrete numerical feature,\nAge,SibSp, Parch, Fare is a continuous numerical feature  \n\n textual features: Name, Sex, Cabin,Embarked, Ticket,\n in Sex, Cabin, Embarked, Ticket Categorical text features. \n\n Numerical features can generally be used directly for model training.\n But sometimes continuous variables are discretized for the stability and robustness of the model.. \n Textual features often need to be converted into numerical features before they can be used for modeling and analysis. \n'
# Discretise age into five equal-width intervals, labelled 1 to 5 from the
# lowest interval to the highest.
age_labels = [1, 2, 3, 4, 5]
df['age category'] = pd.cut(df['age'], bins=5, labels=age_labels)
df.head()

survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding portage category
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C3
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS2
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S3
503Allen, Mr. William Henrymale35.0003734508.0500NaNS3
# Discretise age with explicit edges -- (0,5], (5,15], (15,30], (30,50], (50,80] --
# labelled 1 to 5 in that order. This replaces the 'age category' column.
age_edges = [0, 5, 15, 30, 50, 80]
df['age category'] = pd.cut(df['age'], bins=age_edges, labels=[1, 2, 3, 4, 5])
df.head()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding portage category
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS3
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C4
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
# Discretise age by quantile, cutting at the 0%, 10%, 30%, 50%, 70% and 90%
# quantiles; the five resulting bins are labelled 1 to 5. This replaces the
# 'age category' column.
# NOTE(review): the last edge is the 90% quantile, so ages above it fall outside
# every bin and come back as NaN -- confirm this is intended.
quantile_edges = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
df['age category'] = pd.qcut(df['age'], q=quantile_edges, labels=[1, 2, 3, 4, 5])
df.head()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding portage category
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS2
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C5
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS3
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S4
503Allen, Mr. William Henrymale35.0003734508.0500NaNS4
# Frequency of each distinct value in the categorical text columns, printed in
# this order: gender, boarding port, cabin.
for column_name in ('gender', 'boarding port', 'cabin'):
    print(df[column_name].value_counts())
male      453
female    261
0           1
Name: gender, dtype: int64
S    554
C    130
Q     28
0      1
Name: boarding port, dtype: int64
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
F33            3
              ..
A6             1
C104           1
B39            1
B69            1
0              1
Name: cabin, Length: 135, dtype: int64
# Convert categorical text to numeric codes.

# Method 1: Series.replace with a value mapping -- male -> 1, female -> 2.
# Values that are not keys of the mapping are left as they are.
gender_codes = {'male': 1, 'female': 2}
df['gender category'] = df['gender'].replace(gender_codes)
df.head()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabinboarding portage categorygender category
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS21
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C52
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS32
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S42
503Allen, Mr. William Henrymale35.0003734508.0500NaNS41
# Label-encode the cabin column: each distinct value is mapped to an integer code.
# The column mixes strings with non-string values (NaN is a float), which
# LabelEncoder cannot sort: "'<' not supported between instances of 'str' and
# 'float'". Casting to str first gives a uniform type; missing cabins then form
# their own 'nan' category.
# LabelEncoder is imported in the notebook's top import cell.
df['cabin'] = LabelEncoder().fit_transform(df['cabin'].astype(str))
df.head()
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-28-a090f7066f88> in <module>
      1 from sklearn.preprocessing import LabelEncoder
----> 2 df['cabin'] = LabelEncoder().fit_transform(df['cabin'])
      3 df.head()


/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
    110         """
    111         y = column_or_1d(y, warn=True)
--> 112         self.classes_, y = np.unique(y, return_inverse=True)
    113         return y
    114 


<__array_function__ internals> in unique(*args, **kwargs)


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    317 
    318     if optional_indices:
--> 319         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
    320         aux = ar[perm]
    321     else:


TypeError: '<' not supported between instances of 'str' and 'float'
# Label-encode the boarding port: each distinct value is mapped to an integer code.
# The column holds a mix of strings and non-string values (the value counts show
# an integer 0 alongside 'S', 'C', 'Q'), which makes LabelEncoder fail with
# "'<' not supported between instances of 'int' and 'str'". Casting to str first
# makes every value comparable.
df['boarding port'] = LabelEncoder().fit_transform(df['boarding port'].astype(str))
df.head()

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-34-f52d686f7adb> in <module>
----> 1 df['boarding port'] = LabelEncoder().fit_transform(df['boarding port'])
      2 df.head()


/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
    110         """
    111         y = column_or_1d(y, warn=True)
--> 112         self.classes_, y = np.unique(y, return_inverse=True)
    113         return y
    114 


<__array_function__ internals> in unique(*args, **kwargs)


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
    259     ar = np.asanyarray(ar)
    260     if axis is None:
--> 261         ret = _unique1d(ar, return_index, return_inverse, return_counts)
    262         return _unpack_tuple(ret)
    263 


/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
    317 
    318     if optional_indices:
--> 319         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
    320         aux = ar[perm]
    321     else:


TypeError: '<' not supported between instances of 'int' and 'str'
# One-hot encoding: expand a column into one 0/1 indicator column per distinct value.
# NOTE(review): this encodes the raw 'age' column, which yields one indicator per
# distinct age; encoding the binned 'age category' column may have been intended -- confirm.
age_dummies = pd.get_dummies(df['age'], prefix='age')

# Append the indicator columns to the original frame, column-wise.
df = pd.concat([df, age_dummies], axis=1)
df
df.to_csv('train_onehot.csv')
# One-hot encode cabin and boarding port in turn, appending each set of
# indicator columns (prefixed with the source column name) to the frame.
for source_column in ('cabin', 'boarding port'):
    df = pd.concat([df, pd.get_dummies(df[source_column], prefix=source_column)], axis=1)
df.head()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabin...Cabin_F G73Cabin_F2Cabin_F33Cabin_F4Cabin_G6Cabin_TEmbarkation port_0Embarkation port_CEmbarkation port_QEmbarkation port_S
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaN...0000000001
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85...0000000100
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaN...0000000001
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123...0000000001
503Allen, Mr. William Henrymale35.0003734508.0500NaN...0000000001

5 rows × 643 columns

# Pull the honorific (Mr, Mrs, Miss, ...) out of each name: the first run of
# ASCII letters that is immediately followed by a '.'.
#   - the pattern is a raw string so that '\.' reaches the regex engine as an
#     escaped dot instead of being an invalid Python string escape
#   - expand=False returns a Series for the single capture group
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Show a sample rather than dumping the whole several-hundred-column frame.
df.head()
survivedPosition levelNamegenderagenumber of siblingsNumber of parents and childrenTicket informationfarecabin...Cabin_F2Cabin_F33Cabin_F4Cabin_G6Cabin_TEmbarkation port_0Embarkation port_CEmbarkation port_QEmbarkation port_STitle
Passenger ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaN...000000001Mr
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85...000000100Mrs
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaN...000000001Miss
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123...000000001Mrs
503Allen, Mr. William Henrymale35.0003734508.0500NaN...000000001Mr
..................................................................
88603Rice, Mrs. William (Margaret Norton)female39.00538265229.1250NaN...000000010Mrs
88702Montvila, Rev. Juozasmale27.00021153613.0000NaN...000000001Rev
88811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42...000000001Miss
89011Behr, Mr. Karl Howellmale26.00011136930.0000C148...000000100Mr
89103Dooley, Mr. Patrickmale32.0003703767.7500NaN...000000010Mr

715 rows × 644 columns

# Write the fully processed frame (index included) to CSV in the working directory.
df.to_csv(path_or_buf='test_fin.csv')

Tags: Python Data Analysis AI Data Mining

Posted by designsubway on Thu, 28 Jul 2022 22:17:31 +0530