#AI夏令营 #Datawhale #夏令营
下载相关库
# 下载相关库
!pip install lightgbm openpyxl rdkit catboost
模块导入
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, cv, Pool, metrics
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
warnings.filterwarnings('ignore')
数据预处理
# Load the training and test sheets from their Excel files.
train = pd.read_excel('./data/data280993/traindata-new.xlsx')
test = pd.read_excel('./data/data280993/testdata-new.xlsx')

# The test data has no 'DC50 (nM)' / 'Dmax (%)' columns, so remove them
# from the training data as well.
train = train.drop(columns=['DC50 (nM)', 'Dmax (%)'])

# Columns with fewer than 10 non-null values in the test set are too sparse
# to be useful; collect their names and drop them from both frames.
drop_cols = [f for f in test.columns if test[f].notnull().sum() < 10]
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Stack train and test into one frame so feature engineering is applied
# uniformly to both.
data = pd.concat([train, test], axis=0, ignore_index=True)

# Every column except the first two (uuid and Label in the preview below).
cols = data.columns[2:]
data.head()
uuid | Label | Uniprot | Target | E3 ligase | Name | Smiles | Assay (DC50/Dmax) | IC50 (nM, Protac to Target) | Assay (Protac to Target, IC50) | ... | XLogP3 | Heavy Atom Count | Ring Count | Hydrogen Bond Acceptor Count | Hydrogen Bond Donor Count | Rotatable Bond Count | Topological Polar Surface Area | Molecular Formula | InChI | InChI Key | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.0 | Q9NWZ3 | IRAK4 | CRBN | NaN | CC(C)NC1=CC(N2C=CC3=CC(C#N)=CN=C32)=NC=C1C(=O)... | Degradation of IRAK4 in HEK293T cells after 24... | 7.0 | IC50 was assessed by Caliper biochemistry assay | ... | 2.14 | 62 | 7 | 14 | 5 | 16 | 255.84 | C43H46N10O9 | InChI=1S/C43H46N10O9/c1-24(2)49-31-19-34(52-15... | YNNBDJQWDDDBRU-OAMJFVEXSA-N |
1 | 2 | 1.0 | O15379 | HDAC3 | VHL | XZ9002 | CCCNNC(=O)C1=CC=C(C2=CC=C(NC(=O)CCCCCCC(=O)N[C... | Degradation of HDAC3 in MDA-MB-468 cells after... | 350.0 | IC50 was assessed by HDAC-Glot I/II Assays | ... | 6.81 | 61 | 5 | 9 | 6 | 19 | 181.86 | C47H61N7O6S | InChI=1S/C47H61N7O6S/c1-7-26-49-53-44(58)36-20... | INXGMIWEZJNJOA-NVZKIFIHSA-N |
2 | 3 | 1.0 | P11802 | CDK4 | VHL | CST651 | CC(=O)C1=C(C)C2=CN=C(NC3=CC=C(N4CCN(CCOCCOCCOC... | Degradation of CDK4 in MM.1S cells after 16 h ... | NaN | NaN | ... | 4.76 | 78 | 9 | 18 | 4 | 22 | 235.57 | C56H72FN11O9S | InChI=1S/C56H72FN11O9S/c1-34-42-31-60-54(64-49... | WYPMPGWHIPETEB-VZUPZQALSA-N |
3 | 4 | 1.0 | Q00534 | CDK6 | VHL | CST651 | CC(=O)C1=C(C)C2=CN=C(NC3=CC=C(N4CCN(CCOCCOCCOC... | Degradation of CDK6 in MM.1S cells after 16 h ... | NaN | NaN | ... | 4.76 | 78 | 9 | 18 | 4 | 22 | 235.57 | C56H72FN11O9S | InChI=1S/C56H72FN11O9S/c1-34-42-31-60-54(64-49... | WYPMPGWHIPETEB-VZUPZQALSA-N |
4 | 5 | 1.0 | O14965 | AURKA | CRBN | JB170 | COC1=CC(NC2=NC=C3CN=C(C4=C(F)C=CC=C4OC)C4=CC(C... | Degradation of AURKA in MV4-11 cells after 6 h... | NaN | NaN | ... | 3.69 | 69 | 8 | 15 | 4 | 19 | 238.07 | C48H44ClFN8O11 | InChI=1S/C48H44ClFN8O11/c1-65-35-7-4-6-33(50)4... | GYKNPXCQINZRLL-UHFFFAOYSA-N |
5 rows × 43 columns
特征工程
# Round-trip each SMILES string through an RDKit molecule object and back to
# text so that all entries share one consistent SMILES format.
data['smiles_list'] = data['Smiles'].map(
    lambda smi: Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=True)
)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 44 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 uuid 704 non-null int64
1 Label 351 non-null float64
2 Uniprot 642 non-null object
3 Target 704 non-null object
4 E3 ligase 704 non-null object
5 Name 277 non-null object
6 Smiles 704 non-null object
7 Assay (DC50/Dmax) 704 non-null object
8 IC50 (nM, Protac to Target) 153 non-null float64
9 Assay (Protac to Target, IC50) 169 non-null object
10 Kd (nM, Protac to Target) 65 non-null float64
11 Assay (Protac to Target, Kd) 67 non-null object
12 Assay (Protac to Target, G/H/-T 10 non-null object
13 IC50 (nM, Protac to E3) 32 non-null float64
14 Assay (Protac to E3, IC50) 35 non-null object
15 Kd (nM, Protac to E3) 60 non-null object
16 Assay (Protac to E3, Kd) 60 non-null object
17 delta G (kcal/mol, Protac to E3 24 non-null float64
18 delta H (kcal/mol, Protac to E3 24 non-null float64
19 -T*delta S (kcal/mol, Protac _1 24 non-null float64
20 Assay (Protac to E3, G/H/-TS) 24 non-null object
21 Kd (nM, Ternary complex) 27 non-null object
22 Assay (Ternary complex, Kd) 27 non-null object
23 Assay (Ternary complex, G/H/-TS 11 non-null object
24 IC50 (nM, Cellular activities) 131 non-null float64
25 Assay (Cellular activities, IC5 190 non-null object
26 EC50 (nM, Cellular activities) 42 non-null object
27 Assay (Cellular activities, EC5 42 non-null object
28 GI50 (nM, Cellular activities) 27 non-null object
29 Assay (Cellular activities, GI5 27 non-null object
30 Article DOI 704 non-null object
31 Molecular Weight 704 non-null float64
32 Exact Mass 704 non-null float64
33 XLogP3 704 non-null float64
34 Heavy Atom Count 704 non-null int64
35 Ring Count 704 non-null int64
36 Hydrogen Bond Acceptor Count 704 non-null int64
37 Hydrogen Bond Donor Count 704 non-null int64
38 Rotatable Bond Count 704 non-null int64
39 Topological Polar Surface Area 704 non-null float64
40 Molecular Formula 704 non-null object
41 InChI 704 non-null object
42 InChI Key 704 non-null object
43 smiles_list 704 non-null object
dtypes: float64(12), int64(6), object(26)
memory usage: 242.1+ KB
# Compute TF-IDF features over the normalised SMILES strings.
tfidf = TfidfVectorizer(max_df=0.9, min_df=1, sublinear_tf=True)
res = tfidf.fit_transform(data['smiles_list'])

# Densify the sparse matrix into a DataFrame with one named column per term.
tfidf_df = pd.DataFrame(
    res.toarray(),
    columns=[f'smiles_tfidf_{i}' for i in range(res.shape[1])],
)

# Append the TF-IDF columns to the main frame column-wise.
data = pd.concat([data, tfidf_df], axis=1)
data.head()
uuid | Label | Uniprot | Target | E3 ligase | Name | Smiles | Assay (DC50/Dmax) | IC50 (nM, Protac to Target) | Assay (Protac to Target, IC50) | ... | smiles_tfidf_940 | smiles_tfidf_941 | smiles_tfidf_942 | smiles_tfidf_943 | smiles_tfidf_944 | smiles_tfidf_945 | smiles_tfidf_946 | smiles_tfidf_947 | smiles_tfidf_948 | smiles_tfidf_949 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.0 | Q9NWZ3 | IRAK4 | CRBN | NaN | CC(C)NC1=CC(N2C=CC3=CC(C#N)=CN=C32)=NC=C1C(=O)... | Degradation of IRAK4 in HEK293T cells after 24... | 7.0 | IC50 was assessed by Caliper biochemistry assay | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 2 | 1.0 | O15379 | HDAC3 | VHL | XZ9002 | CCCNNC(=O)C1=CC=C(C2=CC=C(NC(=O)CCCCCCC(=O)N[C... | Degradation of HDAC3 in MDA-MB-468 cells after... | 350.0 | IC50 was assessed by HDAC-Glot I/II Assays | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 3 | 1.0 | P11802 | CDK4 | VHL | CST651 | CC(=O)C1=C(C)C2=CN=C(NC3=CC=C(N4CCN(CCOCCOCCOC... | Degradation of CDK4 in MM.1S cells after 16 h ... | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 4 | 1.0 | Q00534 | CDK6 | VHL | CST651 | CC(=O)C1=C(C)C2=CN=C(NC3=CC=C(N4CCN(CCOCCOCCOC... | Degradation of CDK6 in MM.1S cells after 16 h ... | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 5 | 1.0 | O14965 | AURKA | CRBN | JB170 | COC1=CC(NC2=NC=C3CN=C(C4=C(F)C=CC=C4OC)C4=CC(C... | Degradation of AURKA in MV4-11 cells after 6 h... | NaN | NaN | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 994 columns
# Notebook display: number of distinct values in the 'Name' column.
data['Name'].nunique()
208
# Natural-number (ordinal) encoding
def label_encode(series):
    """Replace each distinct value of ``series`` with an integer code.

    Codes start at 0 and follow the order in which values first appear in
    ``series``. Every entry returned by ``series.unique()`` gets a code,
    including the missing-value entry when the column contains one.

    Args:
        series: pandas Series to encode.

    Returns:
        A Series, aligned with ``series``, holding the integer codes.
    """
    # ``unique()`` keeps NaN whereas ``nunique()`` drops it by default, so
    # sizing the code range with ``nunique()`` came up one short for columns
    # with missing values and left the last distinct value unmapped (it was
    # silently turned into NaN). Enumerating the unique values directly
    # guarantees one code per entry.
    mapping = {value: code for code, value in enumerate(series.unique())}
    return series.map(mapping)
# Integer-encode every object-typed column among the original feature columns.
for col in cols:
    if data[col].dtype != 'object':
        continue
    data[col] = label_encode(data[col])

# Rows with a label form the training set; rows without one form the test set.
labelled = data['Label'].notnull()
train = data[labelled].reset_index(drop=True)
test = data[~labelled].reset_index(drop=True)

# Feature selection: everything except the id, the target and the raw
# normalised SMILES text.
excluded = ['uuid', 'Label', 'smiles_list']
features = [f for f in train.columns if f not in excluded]

# Build the training and test feature matrices.
x_train = train[features]
x_test = test[features]

# Training labels.
y_train = train['Label'].astype(int)

# Notebook display: training columns that still contain missing values.
missing_per_column = x_train.isna().sum()
missing_per_column[missing_per_column > 0]
IC50 (nM, Protac to Target) 283
Kd (nM, Protac to Target) 329
IC50 (nM, Protac to E3) 335
delta G (kcal/mol, Protac to E3 351
delta H (kcal/mol, Protac to E3 351
-T*delta S (kcal/mol, Protac _1 351
IC50 (nM, Cellular activities) 281
dtype: int64
# Notebook display: inspect one of the training columns that contains NaN.
x_train["Kd (nM, Protac to Target)"]
0 NaN
1 NaN
2 NaN
3 NaN
4 375.0
...
346 NaN
347 NaN
348 NaN
349 NaN
350 NaN
Name: Kd (nM, Protac to Target), Length: 351, dtype: float64
训练、预测
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2022):
    """Train ``clf`` with 5-fold cross-validation and predict the test set.

    For every fold a fresh model is fitted on the training part, evaluated on
    the held-out part (AUC drives early stopping / best-iteration selection),
    and used to predict ``test_x``. The per-fold F1 score at a 0.5 probability
    threshold is printed, followed by the list, mean and standard deviation
    of the fold scores.

    Args:
        clf: Classifier class (not an instance). It is called as
            ``clf(iterations=20000, **params, eval_metric='AUC')`` and must
            offer CatBoost-style ``fit`` and ``predict_proba``.
        train_x: Training features; rows are selected with ``.iloc``.
        train_y: Training labels, positionally aligned with ``train_x``.
        test_x: Features of the samples to predict.
        clf_name: Label used as prefix in the printed score summary.
        seed: Seed for the fold shuffling and for the model itself.

    Returns:
        A tuple ``(oof_pred, test_pred)``: the out-of-fold positive-class
        probabilities for every training row, and the test-set probabilities
        averaged over the folds.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    oof_pred = np.zeros(train_x.shape[0])
    test_pred = np.zeros(test_x.shape[0])
    cv_scores = []

    # KFold yields positional indices. Indexing a Series with them via
    # ``train_y[idx]`` is label-based and only works for a 0..n-1 index, so
    # take a positional view of the labels once up front.
    labels = np.asarray(train_y)

    # Loop-invariant model settings. The original literal listed
    # 'random_seed' twice (``seed`` and then ``11``); the later entry won, so
    # the ``seed`` argument never reached the model. The seed is now honoured.
    params = {
        'learning_rate': 0.1,
        'depth': 6,
        'l2_leaf_reg': 10,
        'bootstrap_type': 'Bernoulli',
        'random_seed': seed,
        # Overfitting detector: stop after 100 iterations without improvement.
        'od_type': 'Iter',
        'od_wait': 100,
        'allow_writing_files': False,
        'task_type': 'CPU',
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} {}************************************'.format(str(i+1), str(seed)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        model = clf(iterations=20000, **params, eval_metric='AUC')
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=100,
                  cat_features=[],
                  use_best_model=True,
                  verbose=1)

        # Positive-class probabilities for the held-out fold and the test set.
        val_pred = model.predict_proba(val_x)[:, 1]
        oof_pred[valid_index] = val_pred
        # Each fold contributes an equal share to the averaged test prediction.
        test_pred += model.predict_proba(test_x)[:, 1] / kf.n_splits

        cv_scores.append(f1_score(val_y, np.where(val_pred > 0.5, 1, 0)))
        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred
# Cross-validate CatBoost; keep the out-of-fold and averaged test probabilities.
cat_train, cat_test = cv_model(
    clf=CatBoostClassifier,
    train_x=x_train,
    train_y=y_train,
    test_x=x_test,
    clf_name="cat",
)
************************************ 1 2022************************************
0: test: 0.7420034 best: 0.7420034 (0) total: 53.8ms remaining: 17m 55s
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
100: test: 0.9082492 best: 0.9116162 (58) total: 557ms remaining: 1m 49s
200: test: 0.9082492 best: 0.9158249 (139) total: 1.09s remaining: 1m 47s
Stopped by overfitting detector (100 iterations wait)
bestTest = 0.9158249158
bestIteration = 139
Shrink model to first 140 iterations.
[0.9052631578947369]
************************************ 2 2022************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: test: 0.7957428 best: 0.7957428 (0) total: 6.33ms remaining: 2m 6s
100: test: 0.9248188 best: 0.9248188 (100) total: 599ms remaining: 1m 57s
200: test: 0.9483696 best: 0.9510870 (167) total: 1.2s remaining: 1m 58s
300: test: 0.9547101 best: 0.9547101 (265) total: 1.91s remaining: 2m 5s
400: test: 0.9565217 best: 0.9574275 (324) total: 2.6s remaining: 2m 7s
Stopped by overfitting detector (100 iterations wait)
bestTest = 0.9574275362
bestIteration = 324
Shrink model to first 325 iterations.
[0.9052631578947369, 0.9072164948453608]
************************************ 3 2022************************************
0: test: 0.7631822 best: 0.7631822 (0) total: 5.48ms remaining: 1m 49s
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
100: test: 0.8940796 best: 0.9033302 (65) total: 601ms remaining: 1m 58s
Stopped by overfitting detector (100 iterations wait)
bestTest = 0.9033302498
bestIteration = 65
Shrink model to first 66 iterations.
[0.9052631578947369, 0.9072164948453608, 0.88]
************************************ 4 2022************************************
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: test: 0.8167347 best: 0.8167347 (0) total: 6.05ms remaining: 2m
100: test: 0.8897959 best: 0.8946939 (87) total: 673ms remaining: 2m 12s
200: test: 0.8930612 best: 0.8971429 (155) total: 1.28s remaining: 2m 6s
300: test: 0.8963265 best: 0.8987755 (225) total: 1.81s remaining: 1m 58s
Stopped by overfitting detector (100 iterations wait)
bestTest = 0.8987755102
bestIteration = 225
Shrink model to first 226 iterations.
[0.9052631578947369, 0.9072164948453608, 0.88, 0.8571428571428571]
************************************ 5 2022************************************
0: test: 0.8423913 best: 0.8423913 (0) total: 5.73ms remaining: 1m 54s
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
100: test: 0.9547101 best: 0.9655797 (46) total: 670ms remaining: 2m 12s
Stopped by overfitting detector (100 iterations wait)
bestTest = 0.9655797101
bestIteration = 46
Shrink model to first 47 iterations.
[0.9052631578947369, 0.9072164948453608, 0.88, 0.8571428571428571, 0.9052631578947369]
cat_score_list: [0.9052631578947369, 0.9072164948453608, 0.88, 0.8571428571428571, 0.9052631578947369]
cat_score_mean: 0.8909771335555383
cat_score_std: 0.019683259686381092
#pd.DataFrame(
# {
# 'uuid': test['uuid'],
# 'Label': np.where(cat_test>0.5, 1, 0)
# }
#).to_csv('submit1.csv', index=None)