# import library
# Import numpy and pandas and name them np, pd
import numpy as np
import pandas as pd
# Load the training data from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')
# Preview the first three rows to get a feel for the columns.
first_rows = df.head(n=3)
print(first_rows)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
# Reload the data with custom column labels.
# header=0 discards the header row stored in the file, `names` supplies the
# replacement labels, and index_col makes 'passenger ID' the row index.
column_names = [
    'passenger ID',
    'survived',
    'Position level',
    'Name',
    'gender',
    'age',
    'number of siblings',
    'Number of parents and children',
    'Ticket information',
    'fare',
    'cabin',
    'boarding port',
]
df = pd.read_csv(
    'train.csv',
    names=column_names,
    index_col='passenger ID',
    header=0,
)
# Preview the first three rows to confirm the new labels.
print(df.head(n=3))
Survive or not Position level Name gender \
passenger ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
Age Number of siblings Number of parents and children Ticket information Fare Cabin Port of Embarkation
passenger ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# Show the basic information of the data: non-null count and dtype of every
# column, plus the memory usage.
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 Position level 891 non-null int64
2 Name 891 non-null object
3 gender 891 non-null object
4 age 714 non-null float64
5 Number of siblings 891 non-null int64
6 Number of parents and children 891 non-null int64
7 Ticket information 891 non-null object
8 fare 891 non-null float64
9 cabin 204 non-null object
10 boarding port 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
# Look at both ends of the table: the first 10 rows, then the last 10 rows.
for preview in (df.head(n=10), df.tail(n=10)):
    print(preview)
Survive or not Position level Name gender \
passenger ID
1 0 3 Braund, Mr. Owen Harris male
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female
3 1 3 Heikkinen, Miss. Laina female
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
5 0 3 Allen, Mr. William Henry male
6 0 3 Moran, Mr. James male
7 0 1 McCarthy, Mr. Timothy J male
8 0 3 Palsson, Master. Gosta Leonard male
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female
Age Number of siblings Number of parents and children Ticket information fare Cabin Port of Embarkation
passenger ID
1 22.0 1 0 A/5 21171 7.2500 NaN S
2 38.0 1 0 PC 17599 71.2833 C85 C
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 35.0 1 0 113803 53.1000 C123 S
5 35.0 0 0 373450 8.0500 NaN S
6 NaN 0 0 330877 8.4583 NaN Q
7 54.0 0 0 17463 51.8625 E46 S
8 2.0 3 1 349909 21.0750 NaN S
9 27.0 0 2 347742 11.1333 NaN S
10 14.0 1 0 237736 30.0708 NaN C
Survive or not Position level Name gender age \
passenger ID
882 0 3 Markun, Mr. Johann male 33.0
883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0
884 0 2 Banfield, Mr. Frederick James male 28.0
885 0 3 Sutehall, Mr. Henry Jr male 25.0
886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0
887 0 2 Montvila, Rev. Juozas male 27.0
888 1 1 Graham, Miss. Margaret Edith female 19.0
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN
890 1 1 Behr, Mr. Karl Howell male 26.0
891 0 3 Dooley, Mr. Patrick male 32.0
Number of siblings Number of parents and children Ticket information fare Cabin Port of Embarkation
passenger ID
882 0 0 349257 7.8958 NaN S
883 0 0 7552 10.5167 NaN S
884 0 0 C.A./SOTON 34068 10.5000 NaN S
885 0 0 SOTON/OQ 392076 7.0500 NaN S
886 0 5 382652 29.1250 NaN Q
887 0 0 211536 13.0000 NaN S
888 0 0 112053 30.0000 B42 S
889 1 2 W./C. 6607 23.4500 NaN S
890 0 0 111369 30.0000 C148 C
891 0 0 370376 7.7500 NaN Q
# Build a same-shaped table of booleans: True where a value is missing,
# False where it is present.
missing_mask = df.isnull()
missing_mask
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port |
---|
Passenger ID | | | | | | | | | | | |
---|
1 | False | False | False | False | False | False | False | False | False | True | False |
---|
2 | False | False | False | False | False | False | False | False | False | False | False |
---|
3 | False | False | False | False | False | False | False | False | False | True | False |
---|
4 | False | False | False | False | False | False | False | False | False | False | False |
---|
5 | False | False | False | False | False | False | False | False | False | True | False |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
887 | False | False | False | False | False | False | False | False | False | True | False |
---|
888 | False | False | False | False | False | False | False | False | False | False | False |
---|
889 | False | False | False | False | True | False | False | False | False | True | False |
---|
890 | False | False | False | False | False | False | False | False | False | False | False |
---|
891 | False | False | False | False | False | False | False | False | False | True | False |
---|
891 rows × 11 columns
# Save the data with the renamed columns to a CSV file in the working directory.
# NOTE(review): 'tain_cn.csv' looks like a typo for 'train_cn.csv' — confirm
# nothing else reads this path before renaming it.
df.to_csv(path_or_buf='tain_cn.csv')
# Order the passengers by fare, then by age (both descending), and show the top 10.
by_fare_then_age = df.sort_values(by=['fare', 'age'], ascending=False)
print(by_fare_then_age.head(n=10))
'''
According to common sense, I know that the higher the fare, the better the cabin,
So we can clearly see that 8 of the top 10 passengers survived,
This is a fairly high proportion. Later, you can try to analyze the relationship between ticket price and age, and the relationship between ticket price and survival rate.
'''
Survive or not Position level Name gender \
passenger ID
680 1 1 Cardeza, Mr. Thomas Drake Martinez male
259 1 1 Ward, Miss. Anna female
738 1 1 Lesurer, Mr. Gustave J male
439 0 1 Fortune, Mr. Mark male
342 1 1 Fortune, Miss. Alice Elizabeth female
89 1 1 Fortune, Miss. Mabel Helen female
28 0 1 Fortune, Mr. Charles Alexander male
743 1 1 Ryerson, Miss. Susan Parker "Suzette" female
312 1 1 Ryerson, Miss. Emily Borie female
300 1 1 Baxter, Mrs. James (Helene DeLaudeniere Chaput) female
Age Number of siblings Number of parents and children Ticket information fare Cabin Port of Embarkation
passenger ID
680 36.0 0 1 PC 177 55 512.3292 B51 B53 B55 C
259 35.0 0 0 PC 17755 512.3292 NaN C
738 35.0 0 0 PC 17755 512.3292 B101 C
439 64.0 1 4 19950 263.0000 C23 C25 C27 S
342 24.0 3 2 19950 263.0000 C23 C25 C27 S
89 23.0 3 2 19950 263.0000 C23 C25 C27 S
28 19.0 3 2 19950 263.0000 C23 C25 C27 S
743 21.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
312 18.0 2 2 PC 17608 262.3750 B57 B59 B63 B66 C
300 50.0 0 1 PC 17558 247.5208 B58 B60 C
'\n According to common sense, I know that the higher the fare, the better the cabin,\n So we can clearly see that 8 of the top 10 passengers survived,\n This is a fairly high proportion. Later, you can try to analyze the relationship between ticket price and age, and the relationship between ticket price and survival rate.\n'
# Summarise the fare column: count, mean, standard deviation, min, quartiles, max.
fare_summary = df['fare'].describe()
print(fare_summary)
'''
There are a total of 891 fare data,
Average is about: 32.20,
The standard deviation is about 49.69,It shows that the price fluctuates greatly.
25%of people whose fare is less than 7.91 yes, 50%of people with fares below 14.45,75%of people with fares below 31.00,
The maximum fare is about 512.33,The minimum value is 0.
'''
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: fare, dtype: float64
'\n There are a total of 891 fare data,\n Average is about: 32.20,\n The standard deviation is about 49.69,It shows that the price fluctuates greatly.\n25%of people whose fare is less than 7.91 yes, 50%of people with fares below 14.45,75%of people with fares below 31.00,\n The maximum fare is about 512.33,The minimum value is 0.\n'
# Sort by position level, then by survival (both descending), and show the
# first 10 and last 10 rows. The sorted frame is built once and reused.
by_level_then_survival = df.sort_values(by=['Position level','survived'],ascending=False)
print(by_level_then_survival.head(10))
print(by_level_then_survival.tail(10))
'''
Because the rows are sorted by position level and then by survival (both descending),
the first 10 rows are level-3 passengers who survived and
the last 10 rows are level-1 passengers who did not survive.
This ordering is an artifact of the sort and says nothing about the survival rate of each level.
Later, you can try to explore the relationship between position level and survival with grouped statistics.
'''
# More sorting observations can be made, such as age, gender, etc., and their descriptive statistics can be observed
Survive or not Position level Name gender \
passenger ID
3 1 3 Heikkinen, Miss. Laina female
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female
11 1 3 Sandstrom, Miss. Marguerite Rut female
20 1 3 Masselmani, Mrs. Fatima female
23 1 3 McGowan, Miss. Anna "Annie" female
26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female
29 1 3 O'Dwyer, Miss. Ellen "Nellie" female
33 1 3 Glynn, Miss. Mary Agatha female
37 1 3 Mamee, Mr. Hanna male
40 1 3 Nicola-Yarred, Miss. Jamila female
Age Number of siblings Number of parents and children Ticket information Fare Cabin Port of Embarkation
passenger ID
3 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
9 27.0 0 2 347742 11.1333 NaN S
11 4.0 1 1 PP 9549 16.7000 G6 S
20 NaN 0 0 2649 7.2250 NaN C
23 15.0 0 0 330923 8.0292 NaN Q
26 38.0 1 5 347077 31.3875 NaN S
29 NaN 0 0 330959 7.8792 NaN Q
33 NaN 0 0 335677 7.7500 NaN Q
37 NaN 0 0 2677 7.2292 NaN C
40 14.0 1 0 2651 11.2417 NaN C
Survive or not Position level Name gender Age Number of siblings \
passenger ID
749 0 1 Marvin, Mr. Daniel Warner male 19.0 1
767 0 1 Brewe, Dr. Arthur Jackson male NaN 0
783 0 1 Long, Mr. Milton Clyde male 29.0 0
790 0 1 Guggenheim, Mr. Benjamin male 46.0 0
794 0 1 Hoyt, Mr. William Fisher male NaN 0
807 0 1 Andrews, Mr. Thomas Jr male 39.0 0
816 0 1 Fry, Mr. Richard male NaN 0
823 0 1 Reuchlin, Jonkheer. John George male 38.0 0
868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0
873 0 1 Carlsson, Mr. Frans Olof male 33.0 0
Number of parents and children Ticket information fare Cabin Port of Embarkation
passenger ID
749 0 113773 53.1000 D30 S
767 0 112379 39.6000 NaN C
783 0 113501 30.0000 D6 S
790 0 PC 17593 79.2000 B82 B84 C
794 0 PC 17600 30.6958 NaN C
807 0 112050 0.0000 A36 S
816 0 112058 0.0000 B102 S
823 0 19972 0.0000 NaN S
868 0 PC 17590 50.4958 A24 S
873 0 695 5.0000 B51 B53 B55 S
'\n Through observation, it can be found that the top 10 positions with the highest level (level 3) all survived.\n The last 10 positions with the lowest level (level 1) were not spared\n Later, you can try to explore the relationship between position level and survival\n'
# Missing value observation and handling
# Method 1: info() reports the non-null count of every column.
# It prints the report itself and returns None, so it is called directly;
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
# Method 2: count the missing values in each column.
print(df.isnull().sum())
'''
By observation, it can be found that there are missing values for age, cabin, and port of embarkation
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 891 non-null int64
1 Position level 891 non-null int64
2 Name 891 non-null object
3 gender 891 non-null object
4 age 714 non-null float64
5 Number of siblings 891 non-null int64
6 Number of parents and children 891 non-null int64
7 Ticket information 891 non-null object
8 fare 891 non-null float64
9 cabin 204 non-null object
10 boarding port 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
survived 0
Bin level 0
Name 0
Gender 0
age 177
Number of siblings 0
Number of parents and children 0
Ticket information 0
Ticket Price 0
cabin 687
boarding port 2
dtype: int64
'\n By observation, it can be found that there are missing values for age, cabin, and port of embarkation\n'
# Handling missing values (both options below are left disabled)
# Option 1: drop every row that contains at least one missing value.
# dropna() returns a new DataFrame; pass axis=1 to drop columns instead.
# df.dropna()
# Option 2: replace the rows whose age is missing with 0.
# NOTE: this assignment overwrites EVERY column of the matching rows with 0,
# not just 'age'. To fill only the age column use
#     df.loc[df['age'].isna(), 'age'] = 0
# df[df['age'].isna()] = 0
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port | age category | gender category |
---|
Passenger ID | | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 | 2 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 | 1 |
---|
# Show a copy of the data in which every missing value is replaced by 0.
# This applies to all columns, text columns such as 'cabin' included, and the
# result is only displayed: df itself is left unchanged.
df.fillna(value=0)
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port |
---|
Passenger ID | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | 0 | S |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | 0 | S |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | 0 | S |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | 0 | S |
---|
888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
---|
889 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0000 | 0 | 0 |
---|
890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
---|
891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | 0 | Q |
---|
891 rows × 11 columns
# Show the rows that are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
df[duplicate_mask]
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port |
---|
Passenger ID | | | | | | | | | | | |
---|
18 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
20 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
27 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
29 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
30 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
860 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
864 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
869 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
879 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
889 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0 | 0 | 0.0 | 0 | 0 |
---|
176 rows × 11 columns
# Remove repeated rows, keeping the first occurrence of each, then re-check
# the row count and the non-null counts.
df = df.drop_duplicates(keep='first')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 1 to 891
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 survived 715 non-null int64
1 Position level 715 non-null int64
2 Name 715 non-null object
3 gender 715 non-null object
4 age 715 non-null float64
5 Number of siblings 715 non-null int64
6 Number of parents and children 715 non-null int64
7 Ticket information 715 non-null object
8 fare 715 non-null float64
9 cabin 186 non-null object
10 boarding port 713 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 67.0+ KB
# Feature observation and processing
'''
When we observe the features, we can roughly divide the features into two categories.:
Numerical features: Survived ,Pclass, Age ,SibSp, Parch, Fare,
in Survived, Pclass is a discrete numerical feature,
Age,SibSp, Parch, Fare is a continuous numerical feature
textual features: Name, Sex, Cabin,Embarked, Ticket,
in Sex, Cabin, Embarked, Ticket Categorical text features.
Numerical features can generally be used directly for model training.
But sometimes continuous variables are discretized for the stability and robustness of the model..
Textual features often need to be converted into numerical features before they can be used for modeling and analysis.
'''
'\n When we observe the features, we can roughly divide the features into two categories.: \n Numerical features: Survived ,Pclass, Age ,SibSp, Parch, Fare,\n in Survived, Pclass is a discrete numerical feature,\nAge,SibSp, Parch, Fare is a continuous numerical feature \n\n textual features: Name, Sex, Cabin,Embarked, Ticket,\n in Sex, Cabin, Embarked, Ticket Categorical text features. \n\n Numerical features can generally be used directly for model training.\n But sometimes continuous variables are discretized for the stability and robustness of the model.. \n Textual features often need to be converted into numerical features before they can be used for modeling and analysis. \n'
# Discretise the continuous 'age' column into 5 equal-width intervals and
# label them 1 to 5.
equal_width_groups = pd.cut(x=df['age'], bins=5, labels=[1, 2, 3, 4, 5])
df['age category'] = equal_width_groups
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port | age category |
---|
Passenger ID | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 3 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 2 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 3 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 3 |
---|
# Discretise 'age' with explicit edges into the groups
# (0,5] (5,15] (15,30] (30,50] (50,80], labelled 1 to 5.
age_edges = [0, 5, 15, 30, 50, 80]
df['age category'] = pd.cut(x=df['age'], bins=age_edges, labels=[1, 2, 3, 4, 5])
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port | age category |
---|
Passenger ID | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 3 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 4 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
---|
# Split the continuous 'age' column into five quantile-based groups labelled 1 to 5:
# 0-10%, 10-30%, 30-50%, 50-70% and 70-100% of passengers by age.
# The last edge is 1 rather than 0.9 so that the oldest 10% of passengers fall
# into group 5 instead of being left without a category (NaN).
df['age category'] = pd.qcut(df['age'],[0,0.1,0.3,0.5,0.7,1],labels = [1,2,3,4,5])
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port | age category |
---|
Passenger ID | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 |
---|
# List the distinct values of each categorical text column together with how
# often each one occurs: gender first, then boarding port, then cabin.
for text_column in ('gender', 'boarding port', 'cabin'):
    print(df[text_column].value_counts())
male 453
female 261
0 1
Name: gender, dtype: int64
S 554
C 130
Q 28
0 1
Name: boarding port, dtype: int64
G6 4
C23 C25 C27 4
B96 B98 4
F2 3
F33 3
..
A6 1
C104 1
B39 1
B69 1
0 1
Name: cabin, Length: 135, dtype: int64
# Convert categorical text into numeric codes
# Method 1: Series.replace
# Each value equal to a key of the mapping is swapped for the mapped number
# (male -> 1, female -> 2); values not listed are left as they are.
gender_codes = {'male': 1, 'female': 2}
df['gender category'] = df['gender'].replace(to_replace=gender_codes)
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | boarding port | age category | gender category |
---|
Passenger ID | | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 2 | 1 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 5 | 2 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 3 | 2 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 4 | 2 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 4 | 1 |
---|
from sklearn.preprocessing import LabelEncoder
# Method 2: encode each distinct cabin value as an integer label.
# 'cabin' mixes strings with NaN (a float). LabelEncoder sorts the values and
# cannot compare str with float, so the column is cast to str first; missing
# cabins then become the literal label 'nan'.
df['cabin'] = LabelEncoder().fit_transform(df['cabin'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-28-a090f7066f88> in <module>
1 from sklearn.preprocessing import LabelEncoder
----> 2 df['cabin'] = LabelEncoder().fit_transform(df['cabin'])
3 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'str' and 'float'
# Encode each distinct boarding port value as an integer label.
# The column can hold values of mixed types, which LabelEncoder cannot sort
# against each other, so the column is cast to str before encoding.
df['boarding port'] = LabelEncoder().fit_transform(df['boarding port'].astype(str))
df.head()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-34-f52d686f7adb> in <module>
----> 1 df['boarding port'] = LabelEncoder().fit_transform(df['boarding port'])
2 df.head()
/opt/conda/lib/python3.6/site-packages/sklearn/preprocessing/label.py in fit_transform(self, y)
110 """
111 y = column_or_1d(y, warn=True)
--> 112 self.classes_, y = np.unique(y, return_inverse=True)
113 return y
114
<__array_function__ internals> in unique(*args, **kwargs)
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
259 ar = np.asanyarray(ar)
260 if axis is None:
--> 261 ret = _unique1d(ar, return_index, return_inverse, return_counts)
262 return _unpack_tuple(ret)
263
/opt/conda/lib/python3.6/site-packages/numpy/lib/arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
317
318 if optional_indices:
--> 319 perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
320 aux = ar[perm]
321 else:
TypeError: '<' not supported between instances of 'int' and 'str'
# One-hot encoding
# Expand each distinct 'age' value into its own indicator column named age_<value>.
# NOTE(review): this encodes the raw continuous 'age' column, giving one column
# per distinct age; the binned 'age category' column may have been intended — confirm.
x = pd.get_dummies(data=df['age'], prefix='age')
# Append the indicator columns to the right of the existing columns.
df = pd.concat(objs=[df, x], axis=1)
df
df.to_csv(path_or_buf='train_onehot.csv')
# One-hot encode 'cabin' and 'boarding port' and append the indicator columns
# (cabin columns first, then boarding port columns) to the right of the table.
indicator_frames = [
    pd.get_dummies(df[name], prefix=name)
    for name in ('cabin', 'boarding port')
]
df = pd.concat([df, *indicator_frames], axis=1)
df.head()
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | ... | Cabin_F G73 | Cabin_F2 | Cabin_F33 | Cabin_F4 | Cabin_G6 | Cabin_T | Embarkation port_0 | Embarkation port_C | Embarkation port_Q | Embarkation port_S |
---|
Passenger ID | | | | | | | | | | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
---|
5 rows × 643 columns
# Extract a title such as "Mr", "Mrs" or "Miss" from each name: the first run
# of letters that is immediately followed by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped period instead of being an invalid Python string escape.
# expand=False returns a Series for the single capture group, which is what a
# single-column assignment expects.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
df
| survived | Position level | Name | gender | age | number of siblings | Number of parents and children | Ticket information | fare | cabin | ... | Cabin_F2 | Cabin_F33 | Cabin_F4 | Cabin_G6 | Cabin_T | Embarkation port_0 | Embarkation port_C | Embarkation port_Q | Embarkation port_S | Title |
---|
Passenger ID | | | | | | | | | | | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mr |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Mrs |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Miss |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mrs |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Mr |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Mrs |
---|
887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Rev |
---|
888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | Miss |
---|
890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | Mr |
---|
891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | Mr |
---|
715 rows × 644 columns
# Save the fully processed table, including the row index, to a CSV file.
df.to_csv(path_or_buf='test_fin.csv')