Bootstrap

Python 实现了对一个数据集(从 CSV 文件读取)的数据预处理、异常值检测、数据重构，以及使用多种机器学习模型进行评估和调优的功能。

该代码主要实现了对一个数据集(从 DM_Project_24.csv 文件读取)的数据预处理、异常值检测、数据重构，以及使用多种机器学习模型进行评估和调优的功能。

{
   
 "cells": [
  {
   
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
   
    "collapsed": true,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T03:53:51.550153600Z",
     "start_time": "2024-10-23T03:53:51.542117400Z"
    }
   },
   "outputs": [],
   "source": [
    "#1. imputation\n",
    "#2. outlier detection\n",
    "#3. Normalization\n",
    "#4."
   ]
  },
  {
   
   "cell_type": "code",
   "execution_count": 315,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.random.seed(42)\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:48.194406300Z",
     "start_time": "2024-10-23T11:04:48.177847900Z"
    }
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 316,
   "outputs": [],
   "source": [
    "data = np.genfromtxt('DM_Project_24.csv',delimiter=\",\",skip_header=1)\n"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:48.750530300Z",
     "start_time": "2024-10-23T11:04:48.650819800Z"
    }
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 317,
   "outputs": [
    {
   
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total Missing Values is:9499\n",
      "Total Missing Values on Label is :0\n",
      "Number of missing values by feature:\n",
      " [ 67 101  86  83 101 108  78  90  88  91  81 110  83  94  78  80 107  85\n",
      "  87 100  96  90  84  96  83  90  90  78  82  95 107  98  80  82  97  83\n",
      "  83  86  98  75  83  79  81  95  90  71 100  84  88  77  98  98  91  99\n",
      " 104  73  85 103  78 104  84  87  81  85  89  89  94  81  83  88  95  75\n",
      "  95  82  81  90  87  84  84  87  86  99  92  85  80  98 102 106  81 101\n",
      "  88 101  95 101  95  89  95  98  89  82 109  79  81 151 143   0]\n",
      "Numeber of label 0: 1406\n",
      "Number of label 1: 194\n"
     ]
    }
   ],
   "source": [
    "#calculate total number of missing values\n",
    "total_Value = np.sum(np.isnan(data))\n",
    "print(f'Total Missing Values is:{total_Value}')\n",
    "\n",
    "#calculate count of missing values on labels\n",
    "missing_on_label = np.sum(np.isnan(data[:,-1]))\n",
    "print(f'Total Missing Values on Label is :{missing_on_label}')\n",
    "\n",
    "#show number of missing values by each feature\n",
    "missing_on_feature = np.sum(np.isnan(data),axis=0)\n",
    "print(f'Number of missing values by feature:\\n {missing_on_feature}')\n",
    "\n",
    "#calculate number of 1 and0\n",
    "num_label_zero = np.sum(data[:,-1] == 0)\n",
    "num_label_one = np.sum(data[:,-1] == 1)\n",
    "print(f'Numeber of label 0: {num_label_zero}')\n",
    "print(f'Number of label 1: {num_label_one}')\n"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:49.245104700Z",
     "start_time": "2024-10-23T11:04:49.212110900Z"
    }
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 318,
   "outputs": [],
   "source": [
    "col_features = data[:,:-1]\n",
    "col_label = data[:,-1]\n",
    "col_numerical = data[:,:103]\n",
    "col_nominal = data[:,103:-1]"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:49.774289600Z",
     "start_time": "2024-10-23T11:04:49.742662500Z"
    }
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 319,
   "outputs": [
    {
   
     "data": {
   
      "text/plain": "(1600, 2)"
     },
     "execution_count": 319,
     "metadata": {
   },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Missing value\n",
    "from sklearn.impute import SimpleImputer\n",
    "numerical_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')\n",
    "nominal_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')\n",
    "impute_col_numerical = numerical_imputer.fit_transform(col_numerical)\n",
    "impute_col_nominal = nominal_imputer.fit_transform(col_nominal)\n",
    "impute_col_nominal.shape"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:50.262671100Z",
     "start_time": "2024-10-23T11:04:50.230889900Z"
    }
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 320,
   "outputs": [],
   "source": [
    "#Normalisaion\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "min_max_scaler = MinMaxScaler()\n",
    "standard_scaler = StandardScaler()\n",
    "\n",
    "minmax_col_numerical = min_max_scaler.fit_transform(impute_col_numerical)\n",
    "std_col_numerical = standard_scaler.fit_transform(impute_col_numerical)"
   ],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:50.965009600Z",
     "start_time": "2024-10-23T11:04:50.949314900Z"
    }
   }
  },
  {
   
   "cell_type": "markdown",
   "source": [
    "# Z-score"
   ],
   "metadata": {
   
    "collapsed": false
   }
  },
  {
   
   "cell_type": "code",
   "execution_count": 320,
   "outputs": [],
   "source": [],
   "metadata": {
   
    "collapsed": false,
    "ExecuteTime": {
   
     "end_time": "2024-10-23T11:04:52.112475400Z",
     "start_time": "2024-10-23T11:04:52.082097100Z"
    }
   }
  },
  {
   
   "cell_type": "markdown",
   "source": [
    "# DBSCAN"
   ],
   "metadata": {
   
    "collapsed": false
   }
  },
  {
   
   "cell_ty
;