博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
集成学习-案例分析(幸福感预测)(待完成)
阅读量:3949 次
发布时间:2019-05-24

本文共 14832 字,大约阅读时间需要 49 分钟。

集成学习-案例分析(幸福感预测)

案例背景

幸福感是一个古老而深刻的话题,是人类世代追求的方向。与幸福感相关的因素成千上万、因人而异,大如国计民生,小如路边烤红薯,都会对幸福感产生影响。

该案例为幸福感预测这一经典课题,希望在现有社会科学研究外有其他维度的算法尝试,结合多学科各自优势,挖掘潜在的影响因素,发现更多可解释、可理解的相关关系。
具体来说,我们需要使用包括个体变量(性别、年龄、地域、职业、健康、婚姻与政治面貌等等)、家庭变量(父母、配偶、子女、家庭资本等等)、社会态度(公平、信用、公共服务等等)等139维度的信息来预测其对幸福感的影响。
数据来源于国家官方的《中国综合社会调查(CGSS)》文件中的调查结果。

数据处理

数据特征查看

import pandas as pd

# Load the training and test survey tables, parsing `survey_time` as a
# datetime column. latin-1 is backward compatible with ASCII, so plain-ASCII
# content decodes unchanged.
train = pd.read_csv("train.csv", parse_dates=['survey_time'], encoding='latin-1')
test = pd.read_csv("test.csv", parse_dates=['survey_time'], encoding='latin-1')

# Quick look at the first rows and the column names of both tables.
print(train.head())
print(test.head())
print(train.columns)
print(test.columns)  # was a second print(train.columns) in the original

数据预处理

# Data cleaning: fill missing values and replace negative (invalid) answer
# codes with fixed substitutes.
# NOTE(review): `data` is not defined in the visible snippets -- presumably
# the survey DataFrame built from train/test earlier; confirm before running.

# --- Missing-value imputation -------------------------------------------
# Columns whose missing entries are filled with 0.
zero_fill_cols = [
    'work_status', 'work_yr', 'work_manage', 'work_type',
    'edu_yr', 'edu_status',
    's_work_type', 's_work_status', 's_political', 's_hukou',
    's_income', 's_birth', 's_edu', 's_work_exper',
    'minor_child', 'marital_now', 'marital_1st',
    'social_neighbor', 'social_friend',
]
for col in zero_fill_cols:
    data[col] = data[col].fillna(0)

data['hukou_loc'] = data['hukou_loc'].fillna(1)
# NOTE(review): 66365 is a hard-coded fill value taken from the original
# article; its derivation is not shown here.
data['family_income'] = data['family_income'].fillna(66365)

# --- Leisure items: replace negative codes with the mode -----------------
# `Series.mode()` returns a Series, and assigning a Series through `.loc`
# aligns on index labels, so the original `... = data[col].mode()` left NaN
# in every selected row whose label was not in the mode result. Assign a
# scalar instead, computed over the valid (non-negative) answers only.
for col in ['leisure_' + str(i) for i in range(4, 13)]:
    invalid = data[col] < 0
    valid_modes = data.loc[~invalid, col].mode()
    if not valid_modes.empty:
        data.loc[invalid, col] = valid_modes.iloc[0]

# --- Fixed replacements for negative codes -------------------------------
negative_code_defaults = {
    'socialize': 2,             # rarely
    'relax': 4,                 # often
    'learn': 1,                 # never
    # social activity
    'social_neighbor': 0,
    'social_friend': 0,
    'socia_outing': 1,          # column name spelled as in the data set
    # The original wrote this value into 'social_neighbor' (copy-paste slip);
    # the replacement belongs in 'neighbor_familiarity' itself.
    'neighbor_familiarity': 4,
    'equity': 4,
    'class_10_before': 3,
    'class': 5,
    'class_10_after': 5,
    'class_14': 2,
    'work_status': 0,
    'work_yr': 0,
    'work_manage': 0,
    'work_type': 0,
    'insur_1': 1,
    'insur_2': 1,
    'insur_3': 1,
    'insur_4': 1,
}
for col, default in negative_code_defaults.items():
    data.loc[data[col] < 0, col] = default

# The original ended with `data.loc[data['insur_k'] == 0, 'insur_k'] = 0`
# for k = 1..4; those statements were no-ops and have been dropped.

数据增广

# Feature augmentation: derive marriage, income, housing, class and index
# features, then add group-level means and each respondent's ratio to them.
# NOTE(review): `data` and its 'age' column are not created in the visible
# snippets -- presumably built earlier; confirm before running.

# --- Marriage ------------------------------------------------------------
# Age at first marriage.
data['marital_1stbir'] = data['marital_1st'] - data['birth']
# Age at the most recent marriage.
data['marital_nowtbir'] = data['marital_now'] - data['birth']
# Remarriage indicator: non-zero when the two marriage ages differ.
data['mar'] = data['marital_nowtbir'] - data['marital_1stbir']
# Spouse's age at the current marriage.
data['marital_sbir'] = data['marital_now'] - data['s_birth']
# Age gap between respondent and spouse.
data['age_'] = data['marital_nowtbir'] - data['marital_sbir']

# --- Income ratios -------------------------------------------------------
# Denominators are offset (+1 / +0.01) to avoid division by zero.
# Income relative to spouse's income.
data['income/s_income'] = data['income'] / (data['s_income'] + 1)
# Combined income of respondent and spouse. The original added a stray `+1`
# here (copied from the ratio above); a plain sum needs no offset.
data['income+s_income'] = data['income'] + data['s_income']
# Share of own income in family income.
data['income/family_income'] = data['income'] / (data['family_income'] + 1)
# Share of own + spouse income in family income.
data['all_income/family_income'] = (
    (data['income'] + data['s_income']) / (data['family_income'] + 1)
)
data['income/inc_exp'] = data['income'] / (data['inc_exp'] + 1)
data['family_income/m'] = data['family_income'] / (data['family_m'] + 0.01)
data['income/m'] = data['income'] / (data['family_m'] + 0.01)

# --- Income / floor-area ratios ------------------------------------------
data['income/floor_area'] = data['income'] / (data['floor_area'] + 0.01)
data['all_income/floor_area'] = (
    (data['income'] + data['s_income']) / (data['floor_area'] + 0.01)
)
data['family_income/floor_area'] = data['family_income'] / (data['floor_area'] + 0.01)
data['floor_area/m'] = data['floor_area'] / (data['family_m'] + 0.01)

# --- Social class differences --------------------------------------------
data['class_10_diff'] = data['class_10_after'] - data['class']
data['class_diff'] = data['class'] - data['class_10_before']
data['class_14_diff'] = data['class'] - data['class_14']

# --- Aggregate indices ---------------------------------------------------
# Leisure index: row-wise sum of leisure_1 .. leisure_12.
leisure_fea_lis = ['leisure_' + str(i) for i in range(1, 13)]
data['leisure_sum'] = data[leisure_fea_lis].sum(axis=1)
# Satisfaction index: row-wise sum of public_service_1 .. public_service_9.
public_service_fea_lis = ['public_service_' + str(i) for i in range(1, 10)]
data['public_service_sum'] = data[public_service_fea_lis].sum(axis=1)
# Trust index: row-wise sum of trust_1 .. trust_13.
trust_fea_lis = ['trust_' + str(i) for i in range(1, 14)]
data['trust_sum'] = data[trust_fea_lis].sum(axis=1)

# --- Group means and ratios to the group mean ----------------------------
# The 13 columns summarised per group; order fixes the column creation order.
group_stat_cols = [
    'income', 'family_income', 'equity', 'depression', 'floor_area',
    'health', 'class_10_diff', 'class', 'health_problem', 'family_status',
    'leisure_sum', 'public_service_sum', 'trust_sum',
]

# Columns are created in the same order as the original statements:
# region means, region ratios, then age means, age ratios.
for group_keys in (['province', 'city', 'county'], ['age']):
    # '<key>_<col>_mean': mean of <col> within the respondent's group.
    for key in group_keys:
        for col in group_stat_cols:
            data[key + '_' + col + '_mean'] = (
                data.groupby([key])[col].transform('mean').values
            )
    # '<col>/<key>': respondent's value relative to the group mean.
    # The original offset only 'trust_sum/province' by +1, unlike every
    # other ratio; it is computed consistently here.
    for key in group_keys:
        for col in group_stat_cols:
            data[col + '/' + key] = data[col] / data[key + '_' + col + '_mean']

特征选择

模型建立

转载地址:http://kogwi.baihongyu.com/

你可能感兴趣的文章
“需求为王”才是根本
查看>>
高效率的危害
查看>>
寻找边缘性创新
查看>>
让创意瞄准市场
查看>>
高效经理人应具有的八个重要习惯
查看>>
优秀的领导者能读懂人才
查看>>
大智若愚也是领导力
查看>>
android如何编译MTK的模拟器
查看>>
android如何添加AP中要使用的第三方JAR文件
查看>>
利用sudo命令为Ubuntu分配管理权限
查看>>
Ubuntu下几个重要apt-get命令用法与加速UBUNTU
查看>>
Ubuntu中网页各种插件安装命令
查看>>
使用tar命令备份Ubuntu系统
查看>>
ubuntu flash 文字乱码解决方案
查看>>
在ubuntu中运行exe文件
查看>>
ubuntu安装命令
查看>>
和上司沟通必备8个黄金句
查看>>
联系查看两张卡的未接电话记录
查看>>
把拒接电话作为已经接电话写到call log中
查看>>
FDN号码完全匹配
查看>>