# Clean BRFSS 2020
# ----------------
#
# This module documents the process used to clean and simplify the 
# BRFSS data set used in this lab. Students don't need to interact with this.
# Read more about BRFSS at https://www.cdc.gov/brfss/annual_data/annual_2020.html

# First, download and unzip https://www.cdc.gov/brfss/annual_data/2020/files/LLCP2020XPT.zip
# You should now have a file called LLCP2020.XPT

import pandas as pd
# Load the raw BRFSS 2020 survey export (SAS transport format).
df = pd.read_sas("LLCP2020.XPT")

# Keep only rows whose disposition code is 1100
# (NOTE: "completed interview" per the BRFSS codebook -- confirm there).
df = df[df.DISPCODE == 1100]

# Each step below first drops rows whose raw code falls outside the values
# we recode (comparisons against NaN are False, so missing values are dropped
# too), then derives a simplified, readable column from the raw one.

# Sex: unmapped codes become NaN.
df["sex"] = df["SEXVAR"].map({1: "male", 2: "female"})

# General health: raw codes 1..5 are reversed so that a larger number in
# "health" corresponds to a smaller raw GENHLTH code.
df = df[df.GENHLTH <= 5]
df["health"] = df.GENHLTH.map({1: 5, 2: 4, 3: 3, 4: 2, 5: 1})

# MEDCOST: raw code 1 -> True, 2 -> False.
df = df[df.MEDCOST <= 2]
df["no_doctor"] = df.MEDCOST.map({1: True, 2: False})

# EXERANY2: raw code 1 -> True, 2 -> False.
df = df[df.EXERANY2 <= 2]
df["exercise"] = df.EXERANY2.map({1: True, 2: False})

# Sleep time: keep values below 25, stored as integers.
df = df[df.SLEPTIM1 < 25]
df["sleep"] = df.SLEPTIM1.astype(int)

# Income category: keep codes below 9, stored as integers.
df = df[df.INCOME2 < 9]
df["income"] = df.INCOME2.astype(int)

# Weight and height: drop missing values and divide the raw values by 100
# (presumably converting to kilograms and metres -- see the BRFSS codebook).
df = df[~df.WTKG3.isna()]
df["weight"] = df.WTKG3 / 100
df = df[~df.HTM4.isna()]
df["height"] = df.HTM4 / 100

# Sexual orientation is recorded in two separate columns (SOFEMALE / SOMALE).
# Keep rows where either holds a recognised code, then merge them into one
# column, preferring SOFEMALE and falling back to SOMALE where it is missing.
df = df[(df.SOFEMALE.isin([1, 2, 3, 4, 7, 9])) | (df.SOMALE.isin([1, 2, 3, 4, 7, 9]))]
# Assign the filled Series back in one step. Calling fillna(inplace=True) on
# a column selection is chained assignment, which does not update the frame
# under pandas copy-on-write.
df["sexual_orientation"] = df.SOFEMALE.fillna(df.SOMALE)
df["sexual_orientation"] = df["sexual_orientation"].map({1: "homosexual", 2: "heterosexual", 3: "bisexual", 4: "other", 7: "other", 9: "other"})

# Education level: keep codes 1..4 and give them readable labels.
df = df[df._EDUCAG.isin([1, 2, 3, 4])]
df["education"] = df._EDUCAG.map({1: "none_completed", 2: "high_school", 3: "some_college", 4: "college"})

# Age group code -> a representative age
# (presumably the lower bound of each bracket -- confirm in the codebook).
df["age"] = df._AGE_G.map({1: 18, 2: 25, 3: 35, 4: 45, 5: 55, 6: 65})

# Keep only the simplified columns and write the cleaned data set.
df = df[["age", "sex", "income", "education", "sexual_orientation", "height", "weight", "health", "no_doctor", "exercise", "sleep"]]
df.to_csv("brfss_2020.csv", index=False)