Course DS 250
Nefi Melgar

Read and format project data

import pandas as pd

url = "https://github.com/fivethirtyeight/data/blob/master/star-wars-survey/StarWars.csv?raw=true"
data_sw = pd.read_csv(url, encoding="ISO-8859-1")
# drop the first row, which holds sub-header text rather than survey responses
data_sw = data_sw.drop(index=0)
data_sw
Survey data is notoriously difficult to munge. Even when the data is recorded cleanly, the options for ‘write in questions’, ‘choose from multiple answers’, ‘pick all that are right’, and ‘multiple choice questions’ make storing the data in a tidy format difficult.
In 2014, FiveThirtyEight surveyed over 1000 people to write the article titled, America’s Favorite ‘Star Wars’ Movies (And Least Favorite Characters). They have provided the data on GitHub.
For this project, your client would like to use the Star Wars survey data to figure out if they can predict an interviewing job candidate’s current income based on a few responses about Star Wars movies.
SHORTEN THE COLUMNS AND CLEAN THEM UP FOR EASIER USE WITH PANDAS
We wrangled the messy Star Wars survey data by shortening and cleaning the column names for smoother analysis with pandas. Below is a glimpse of the transformation:
# create a df to display the old and new column names side by side
new_columns_df = pd.DataFrame(columns=["old_columns", "new_columns"])
old_cols = data_sw.columns.tolist()
# store the original column names from data_sw in the 'old_columns' column
new_columns_df["old_columns"] = old_cols
def new_columns(df_frame):
# new names for the columns
df_frame.columns = [
"respondent_id",
"seen_any_sw_film",
"sw_fan",
"seen_ep1",
"seen_ep2",
"seen_ep3",
"seen_ep4",
"seen_ep5",
"seen_ep6",
"rank_ep1",
"rank_ep2",
"rank_ep3",
"rank_ep4",
"rank_ep5",
"rank_ep6",
"fav_han",
"fav_luke",
"fav_leia",
"fav_anakin",
"fav_obi",
"fav_emperor",
"fav_darth",
"fav_lando",
"fav_boba",
"fav_c3po",
"fav_r2d2",
"fav_jarjar",
"fav_padme",
"fav_yoda",
"shot_first",
"expanded_universe_fam",
"expanded_universe_fan",
"star_trek_fan",
"gender",
"age",
"income",
"education",
"location",
]
new_columns(data_sw)
# insert new column names in the new cols df
new_columns_df["new_columns"] = data_sw.columns.tolist()
new_columns_df.head(10)
| | old_columns | new_columns |
|---|---|---|
| 0 | RespondentID | respondent_id |
| 1 | Have you seen any of the 6 films in the Star W... | seen_any_sw_film |
| 2 | Do you consider yourself to be a fan of the St... | sw_fan |
| 3 | Which of the following Star Wars films have yo... | seen_ep1 |
| 4 | Unnamed: 4 | seen_ep2 |
| 5 | Unnamed: 5 | seen_ep3 |
| 6 | Unnamed: 6 | seen_ep4 |
| 7 | Unnamed: 7 | seen_ep5 |
| 8 | Unnamed: 8 | seen_ep6 |
| 9 | Please rank the Star Wars films in order of pr... | rank_ep1 |
Column names were changed across the entire dataframe, making it much easier to read and reference.
CLEAN AND FORMAT THE DATA SO THAT IT CAN BE USED IN A MACHINE LEARNING MODEL.
As part of the cleaning we are going to complete each of the items listed below; doing this leaves the data formatted and prepared for use in a ML model:

- Filter the dataset to respondents that have seen at least one film
- Create a new column that converts the age ranges to a single number, and drop the age range categorical column
- Create a new column that converts the education groupings to a single number, and drop the school categorical column
- Create a new column that converts the income ranges to a single number, and drop the income range categorical column
- Create the target (also known as “y” or “label”) column based on the new income range column
- One-hot encode all remaining categorical columns
# give respondent_id an easier-to-read sequential number
data_sw["respondent_id"] = range(1, len(data_sw) + 1)
# Filter the dataset to respondents that have seen at least one film
data_sw = data_sw[data_sw["seen_any_sw_film"] == "Yes"]
# fill NaN for numerical columns using the median
num_cols = data_sw.select_dtypes(include=["float64", "int64"]).columns
for col in num_cols:
    data_sw[col] = data_sw[col].fillna(data_sw[col].median())
# fill NaN for categorical columns using the mode
cat_cols = data_sw.select_dtypes(include=["object"]).columns
for col in cat_cols:
    data_sw[col] = data_sw[col].fillna(data_sw[col].mode()[0])
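As a quick sanity check (a minimal sketch, not part of the original output), we can confirm that the imputation left no missing values behind:

# sanity check: every column should now report zero missing values
print("remaining NaN values:", data_sw.isna().sum().sum())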
# Create a new column that converts the age ranges to a single number. Drop the age range categorical column
def convert_age(age_range):
    if pd.isnull(age_range):
        return None
    # map each age bracket to (roughly) its midpoint
    age_mapping = {
        "18-29": 24,
        "30-44": 37,
        "45-60": 53,
        "> 60": 65,
    }
    return age_mapping.get(age_range, None)
data_sw["age_range"] = data_sw["age"].apply(convert_age)
data_sw = data_sw.drop(columns=["age"])
# Create a new column that converts the education groupings to a single number. Drop the school categorical column
def convert_education(education):
if pd.isnull(education):
return None
edu_mapping = {
"Less than high school degree": 1,
"High school degree": 2,
"Some college or Associate degree": 3,
"Bachelor degree": 4,
"Graduate degree": 5,
}
return edu_mapping.get(education, None)
data_sw["education_group"] = data_sw["education"].apply(convert_education)
data_sw = data_sw.drop(columns=["education"])
# Create a new column that converts the income ranges to a single number. Drop the income range categorical column
def convert_income(income_range):
    if pd.isnull(income_range):
        return None
    # map each income bracket to (roughly) its midpoint
    income_mapping = {
        "$0 - $24,999": 12500,
        "$25,000 - $49,999": 37500,
        "$50,000 - $99,999": 75000,
        "$100,000 - $149,999": 125000,
        "$150,000+": 150000,
    }
    return income_mapping.get(income_range, None)
data_sw["income_group"] = data_sw["income"].apply(convert_income)
data_sw = data_sw.drop(columns=["income"])
# Create your target (also known as “y” or “label”) column based on the new income range column
data_sw["target_income"] = data_sw["income_group"].apply(
lambda x: 1 if x > 50000 else 0
)
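Before one-hot encoding and modeling, it is worth inspecting how the two target classes are distributed, since a heavily skewed label can make accuracy misleading. A minimal sketch (the printed counts are not part of the original output):

# check the class balance of the new target column
print(data_sw["target_income"].value_counts())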
# One-hot encode all remaining categorical columns
data_sw = pd.get_dummies(data_sw, columns=["sw_fan", "gender", "location"])
data_sw.head(2)
| | respondent_id | seen_any_sw_film | seen_ep1 | seen_ep2 | seen_ep3 | seen_ep4 | seen_ep5 | seen_ep6 | rank_ep1 | rank_ep2 | ... | gender_Male | location_East North Central | location_East South Central | location_Middle Atlantic | location_Mountain | location_New England | location_Pacific | location_South Atlantic | location_West North Central | location_West South Central |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | Yes | Star Wars: Episode I The Phantom Menace | Star Wars: Episode II Attack of the Clones | Star Wars: Episode III Revenge of the Sith | Star Wars: Episode IV A New Hope | Star Wars: Episode V The Empire Strikes Back | Star Wars: Episode VI Return of the Jedi | 3 | 2 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 3 | Yes | Star Wars: Episode I The Phantom Menace | Star Wars: Episode II Attack of the Clones | Star Wars: Episode III Revenge of the Sith | Star Wars: Episode IV A New Hope | Star Wars: Episode V The Empire Strikes Back | Star Wars: Episode VI Return of the Jedi | 1 | 2 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |

2 rows × 49 columns
Columns were cleaned, NaN values were filled, and new columns were added.
BUILD A MACHINE LEARNING MODEL THAT PREDICTS WHETHER A PERSON MAKES MORE THAN $50K
We are going to build a machine learning model to predict whether a person makes more than $50k. The model chosen is a Random Forest Classifier, trained on the survey responses; the dataset was preprocessed above to handle missing values, encode categorical variables, and convert the age, education, and income ranges into numerical values.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# split the data into features and target label,
# dropping the free-text survey columns that were not encoded
X = data_sw.drop(
columns=[
"target_income",
"seen_any_sw_film",
"seen_ep1",
"seen_ep2",
"seen_ep3",
"seen_ep4",
"seen_ep5",
"seen_ep6",
"fav_han",
"fav_luke",
"fav_leia",
"fav_anakin",
"fav_obi",
"fav_emperor",
"fav_darth",
"fav_lando",
"fav_boba",
"fav_c3po",
"fav_r2d2",
"fav_jarjar",
"fav_padme",
"fav_yoda",
"shot_first",
"expanded_universe_fam",
"expanded_universe_fan",
"star_trek_fan",
]
)
y = data_sw["target_income"]
# split data in training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# standardize features (not strictly required for tree-based models,
# but applied here so the scaled data is actually used downstream)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# initialize the random forest classifier and fit it on the scaled features
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred = rf_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
# show the accuracy
print(f"The accuracy of the Random Forest Classifier is: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
The accuracy of the Random Forest Classifier is: 1.00
precision recall f1-score support
1 1.00 1.00 1.00 281
accuracy 1.00 281
macro avg 1.00 1.00 1.00 281
weighted avg 1.00 1.00 1.00 281
The report shows a perfect accuracy of 100% with a support of 281, but note that it lists only class 1: every respondent in the test set was labeled as earning more than $50k. A perfect score on a single-class test set reflects a degenerate target rather than a strong model, so this result should be treated as a red flag.
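A quick way to expose this problem (a diagnostic sketch, not part of the original analysis) is to inspect the label distribution before training and, when both classes are present, use a stratified split so the test set mirrors the overall class ratio:

# inspect the label distribution before trusting the accuracy score
print(y.value_counts())
# stratify keeps the class ratio identical in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)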
VALIDATE THAT DATA PROVIDED LINES UP WITH THE ARTICLE
We are going to validate that the data provided on GitHub lines up with the article by recreating two of the visuals from the original article, which will let us know whether the data makes sense or not.
Based on the “Who Shot First” column, a chart was recreated showing the percentage of respondents who chose each answer.
import plotly.express as px

# reload the raw data and reapply the cleaned column names
url = "https://github.com/fivethirtyeight/data/blob/master/star-wars-survey/StarWars.csv?raw=true"
data_sw = pd.read_csv(url, encoding="ISO-8859-1")
data_sw = data_sw.drop(index=0)
new_columns(data_sw)
shot_first_counts = data_sw["shot_first"].value_counts(normalize=True) * 100
shot_first_counts = shot_first_counts.reset_index()
shot_first_counts.columns = ["shot_first", "percentage"]
# round to whole number
shot_first_counts["percentage"] = shot_first_counts["percentage"].round(0)
# create chart
fig = px.bar(
shot_first_counts,
y="shot_first",
x="percentage",
title="Who Shot First?",
labels={"shot_first": "Character who shot first", "percentage": "Percentage"},
text="percentage",
orientation="h",
)
fig.show()
This chart shows the same percentages as the article, demonstrating consistency with the data.
The article then reports which of the films respondents had seen: with 835 people responding, here is the probability that someone has seen a given “Star Wars” film, given that they have seen any Star Wars film.
data_sw = data_sw[data_sw["seen_any_sw_film"] == "Yes"]
star_wars_movies_list = [
"The Phantom Menace",
"Attack of the Clones",
"Revenge of the Sith",
"A New Hope",
"The Empire Strikes Back",
"Return of the Jedi",
]
see_movies_list = [
"seen_ep1",
"seen_ep2",
"seen_ep3",
"seen_ep4",
"seen_ep5",
"seen_ep6",
]
percentages = []
for seen in see_movies_list:
    # share of respondents (who saw any film) with a non-null entry for this film
    percentage = data_sw[seen].notna().mean() * 100
    percentages.append(percentage)
seen_film_counts = pd.DataFrame(
    {"Movie": star_wars_movies_list, "Percentage": percentages}
)
seen_film_counts["Percentage"] = seen_film_counts["Percentage"].round(0)
# create chart
seen_fig = px.bar(
seen_film_counts,
y="Movie",
x="Percentage",
title="Which 'Star Wars' Movie Have You Seen?",
text="Percentage",
orientation="h",
)
seen_fig.show()
This chart shows different percentages than the article, even after filtering to respondents who have seen at least one film.
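To track down the discrepancy, a minimal debugging sketch (not part of the original analysis) is to print the raw counts behind each percentage and compare them against the 835 respondents the article reports:

# print the raw count behind each percentage for comparison with the article
for seen, movie in zip(see_movies_list, star_wars_movies_list):
    print(f"{movie}: {data_sw[seen].notna().sum()} of {len(data_sw)}")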