该代码主要实现了对一个数据集（从 DM_Project_24.csv 文件读取）的数据预处理、异常值检测、数据重构，
以及使用多种机器学习模型进行评估和调优的功能。
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-10-23T03:53:51.550153600Z",
"start_time": "2024-10-23T03:53:51.542117400Z"
}
},
"outputs": [],
"source": [
"# 1. Imputation\n",
"# 2. Outlier detection\n",
"# 3. Normalization\n",
"# 4. TODO: step not yet specified"
]
},
{
"cell_type": "code",
"execution_count": 315,
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"np.random.seed(42)\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:48.194406300Z",
"start_time": "2024-10-23T11:04:48.177847900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 316,
"outputs": [],
"source": [
"# Load the CSV into a NumPy array; skip_header=1 skips the first line of the file.\n",
"data = np.genfromtxt(\n",
"    'DM_Project_24.csv',\n",
"    delimiter=\",\",\n",
"    skip_header=1,\n",
")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:48.750530300Z",
"start_time": "2024-10-23T11:04:48.650819800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 317,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Missing Values is:9499\n",
"Total Missing Values on Label is :0\n",
"Number of missing values by feature:\n",
" [ 67 101 86 83 101 108 78 90 88 91 81 110 83 94 78 80 107 85\n",
" 87 100 96 90 84 96 83 90 90 78 82 95 107 98 80 82 97 83\n",
" 83 86 98 75 83 79 81 95 90 71 100 84 88 77 98 98 91 99\n",
" 104 73 85 103 78 104 84 87 81 85 89 89 94 81 83 88 95 75\n",
" 95 82 81 90 87 84 84 87 86 99 92 85 80 98 102 106 81 101\n",
" 88 101 95 101 95 89 95 98 89 82 109 79 81 151 143 0]\n",
"Numeber of label 0: 1406\n",
"Number of label 1: 194\n"
]
}
],
"source": [
"# Boolean mask of NaN entries, computed once and reused for the counts below.\n",
"nan_mask = np.isnan(data)\n",
"\n",
"# Total number of missing values across the whole array.\n",
"total_Value = nan_mask.sum()\n",
"print(f'Total Missing Values is:{total_Value}')\n",
"\n",
"# Missing values in the last column, which holds the labels.\n",
"missing_on_label = nan_mask[:, -1].sum()\n",
"print(f'Total Missing Values on Label is :{missing_on_label}')\n",
"\n",
"# Missing values per column (axis=0 sums down the rows).\n",
"missing_on_feature = nan_mask.sum(axis=0)\n",
"print(f'Number of missing values by feature:\\n {missing_on_feature}')\n",
"\n",
"# Class balance: number of rows whose last column equals 0 and 1 respectively.\n",
"num_label_zero = (data[:, -1] == 0).sum()\n",
"num_label_one = (data[:, -1] == 1).sum()\n",
"print(f'Numeber of label 0: {num_label_zero}')\n",
"print(f'Number of label 1: {num_label_one}')\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:49.245104700Z",
"start_time": "2024-10-23T11:04:49.212110900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 318,
"outputs": [],
"source": [
"# Split the array by column role: the last column is the label, the rest are features.\n",
"col_label = data[:, -1]\n",
"col_features = data[:, :-1]\n",
"\n",
"# The first 103 columns are handled as numerical features; the columns between\n",
"# them and the label column are handled as nominal features.\n",
"numerical_cols = slice(None, 103)\n",
"nominal_cols = slice(103, -1)\n",
"col_numerical = data[:, numerical_cols]\n",
"col_nominal = data[:, nominal_cols]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:49.774289600Z",
"start_time": "2024-10-23T11:04:49.742662500Z"
}
}
},
{
"cell_type": "code",
"execution_count": 319,
"outputs": [
{
"data": {
"text/plain": "(1600, 2)"
},
"execution_count": 319,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"# Missing-value imputation\n",
"from sklearn.impute import SimpleImputer\n",
"\n",
"# NOTE(review): both imputers are fitted on every row; if a train/test split\n",
"# happens later, confirm this does not leak test-set statistics.\n",
"\n",
"# Numerical columns: replace NaN with the column median.\n",
"numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"impute_col_numerical = numerical_imputer.fit_transform(col_numerical)\n",
"\n",
"# Nominal columns: replace NaN with the most frequent value of the column.\n",
"nominal_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
"impute_col_nominal = nominal_imputer.fit_transform(col_nominal)\n",
"\n",
"# Show the shape of the imputed nominal block as the cell output.\n",
"impute_col_nominal.shape"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:50.262671100Z",
"start_time": "2024-10-23T11:04:50.230889900Z"
}
}
},
{
"cell_type": "code",
"execution_count": 320,
"outputs": [],
"source": [
"# Normalisation\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"# Min-max scaling of the imputed numerical columns (default scaler settings).\n",
"min_max_scaler = MinMaxScaler()\n",
"minmax_col_numerical = min_max_scaler.fit_transform(impute_col_numerical)\n",
"\n",
"# Standardisation of the same imputed numerical columns (default scaler settings).\n",
"standard_scaler = StandardScaler()\n",
"std_col_numerical = standard_scaler.fit_transform(impute_col_numerical)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:50.965009600Z",
"start_time": "2024-10-23T11:04:50.949314900Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"# Z-score"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 320,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-10-23T11:04:52.112475400Z",
"start_time": "2024-10-23T11:04:52.082097100Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"# DBSCAN"
],
"metadata": {
"collapsed": false
}
},
{
"cell_ty