import pandas as pd
import numpy as np
import json
import re
import fasttext
import warnings
warnings.filterwarnings('ignore')
class Config:
    """Script-wide settings: RNG seed, vocabulary lists and run switches."""

    # Seed passed to seed_everything().
    seed = 2024

    # Tokens counted in every abstract (one `abstract_top{i}` feature each).
    # Matching is done on lower-cased whitespace-separated tokens, which is
    # why entries such as 'however,' and 'paper,' keep their punctuation.
    abstracttopwords = [
        'the', 'of', 'and', 'in', 'to', 'a', 'is', 'with', 'for', 'that',
        'by', 'on', 'was', 'this', 'are', 'we', 'were', 'as', 'from', 'an',
        'be', 'at', 'can', 'which', 'results', 'or', 'using', 'based', 'has', 'have',
        'between', 'it', 'method', 'patients', 'data', 'model', 'than', 'proposed', 'our', 'these',
        'system', 'been', 'two', 'also', 'used', 'not', 'different', 'high', 'more', 'study',
        'show', '=', 'both', 'into', 'new', 'its', 'control', 'cancer', 'analysis', 'performance',
        'such', '2', 'their', 'after', 'paper', 'all', 'but', 'power', 'however,', 'one',
        'algorithm', 'time', 'compared', 'may', 'cell', 'when', 'other', 'under', 'rate', 'paper,',
        'energy', 'network', 'higher', 'each', 'expression', 'information', 'only', 'during', '1', 'significantly',
        'most', 'surface', 'cells', 'approach', 'effect', '3', 'through', 'novel', 'low', 'group',
    ]

    # Tokens counted in every title (one `title_top{i}` feature each).
    titletopwords = [
        'of', 'and', 'in', 'for', 'the', 'a', 'on', 'with', 'by', 'based',
        'to', 'from', 'study', 'cancer', 'analysis', 'using', 'an', 'system', 'method', 'patients',
        'cell', 'control', 'model', 'systems', 'design', 'power', 'effect', 'phase', 'data', 'lung',
        'networks', 'as', 'network', 'at', 'properties', 'detection', 'high', 'novel', 'application', 'its',
        'learning', 'new', 'algorithm', 'research', 'carbon', 'via', 'image', 'treatment', 'effects', 'synthesis',
        'performance', 'human', 'risk', 'between', 'structure', '2', 'energy', 'breast', 'approach', 'cells',
        'surface', 'dynamic', 'expression', 'advanced', 'optimization', 'neural', 'clinical', 'efficient', 'is', 'wireless',
        'evaluation', 'under', 'simulation', 'carcinoma', 'development', 'water', 'hybrid', 'films', 'growth', 'multiple',
        'during', 'characteristics', 'therapy', 'cancer.', 'modeling', 'china', 'robust', 'after', 'optical', 'recognition',
        'gene', '3d', 'distribution', 'estimation', 'process', 'molecular', 'low', 'characterization', 'through', 'or',
    ]

    # How many leading entries of each top-word list are turned into features.
    top = 100

    # Lower-case substrings searched for in every author's organisation string.
    country = [
        'china', 'japan', 'france', 'usa', 'switzerland', 'uk', 'germany', 'canada',
        'australia', 'hong kong', 'united states', 'u.s.a', 'singapore', 'united kingdom', 'russia',
    ]

    # When True the feature loops stop after the first author (quick smoke run).
    try_program = False
import random
def seed_everything(seed):
    """Seed NumPy's global RNG and the stdlib ``random`` module with *seed*."""
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
seed_everything(Config.seed)

path = '/kaggle/input/'


def _load_json(filename):
    """Parse one JSON file from the competition data directory under ``path``."""
    # Explicit UTF-8 so that parsing does not depend on the platform's
    # locale default encoding.
    with open(path + "2024kddcupwhoiswho/2024kddcupwhoiswho/" + filename,
              encoding="utf-8") as f:
        return json.load(f)


train_author = _load_json("train_author.json")
pid_to_info = _load_json("pid_to_info_all.json")
valid_author = _load_json("ind_test_author_filter_public.json")
# Same file as valid_author, parsed a second time so that `submission` is an
# independent object (presumably filled with predictions later - confirm
# against the rest of the script).
submission = _load_json("ind_test_author_filter_public.json")
def ARI(txt):
    """Return the Automated Readability Index of *txt*.

    The score is ``4.71 * (characters / words) + 0.5 * (words / sentences)
    - 21.43`` where

    * ``characters`` is ``len(txt)`` (every character, including spaces),
    * ``words`` is the number of pieces produced by splitting on space,
      newline, ``.``, ``?``, ``!`` and ``,`` (empty pieces are counted),
    * ``sentences`` is the number of pieces produced by splitting on
      ``.``, ``?`` and ``!``.

    ``None`` is scored as the one-character text ``"q"``.
    """
    if txt is None:
        txt = "q"
    characters = len(txt)
    # Raw-string character classes: same delimiters as before, without the
    # invalid "\," escape sequence.
    words = len(re.split(r"[ \n.?!,]", txt))
    sentences = len(re.split(r"[.?!]", txt))
    # re.split always yields at least one piece, so neither divisor is zero.
    return 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
"""
http://www.supermagnus.com/mac/Word_Counter/index.html
McAlpine EFLAW© Test
(W + SW) / S
McAlpine EFLAW© Readability
Scale:
1-20: Easy
21-25: Quite Easy
26-29: Mildly Difficult
≥ 30: Very Confusing
S:total sentences
W:total words
SW:short words ("miniwords" of three letters or fewer)
"""
def McAlpine_EFLAW(txt):
    """Return the McAlpine EFLAW readability score of *txt*.

    The score is ``(W + M) / S`` where

    * ``W`` is the number of pieces produced by splitting on space, newline,
      ``.``, ``?``, ``!`` and ``,`` (empty pieces are counted, as in the
      other readability helpers of this file),
    * ``M`` is the number of "miniwords": non-empty pieces of at most three
      characters,
    * ``S`` is the number of pieces produced by splitting on ``.``, ``?``
      and ``!``.

    NOTE(review): the previous implementation returned ``(W + S * W) / S``,
    i.e. it treated the "SW" term of the published formula as the product
    sentences x words instead of the short-word count. Features computed
    with the old code are therefore not comparable with this version.

    ``None`` is scored as the one-character text ``"q"``.
    """
    if txt is None:
        txt = "q"
    tokens = re.split(r"[ \n.?!,]", txt)
    W = len(tokens)
    # re.split always yields at least one piece, so S is never zero.
    S = len(re.split(r"[.?!]", txt))
    mini_words = sum(1 for token in tokens if 0 < len(token) <= 3)
    return (W + mini_words) / S
"""
https://readable.com/readability/coleman-liau-readability-index/
=0.0588*L-0.296*S-15.8
L is the number of letters per 100 words; S is the average number of sentences per 100 words.
"""
def CLRI(txt):
    """Return the Coleman-Liau readability index of *txt*.

    The score is ``0.0588 * L - 0.296 * S - 15.8`` where ``L`` is the number
    of characters per 100 words and ``S`` the number of sentences per 100
    words.

    * characters are counted with ``len(txt)``, so spaces and punctuation
      are included,
    * words are the pieces produced by splitting on space, newline, ``.``,
      ``?``, ``!`` and ``,`` (empty pieces are counted),
    * sentences are the pieces produced by splitting on ``.``, ``?`` and
      ``!``.

    ``None`` is scored as the one-character text ``"q"``.
    """
    if txt is None:
        txt = "q"
    characters = len(txt)
    # Raw-string character classes: same delimiters as before, without the
    # invalid "\," escape sequence.
    words = len(re.split(r"[ \n.?!,]", txt))
    sentences = len(re.split(r"[.?!]", txt))
    # re.split always yields at least one piece, so `words` is never zero.
    L = 100 * characters / words
    S = 100 * sentences / words
    return 0.0588 * L - 0.296 * S - 15.8
def calwordcnt(txt, find_word):
    """Count the whitespace-separated tokens of *txt* equal to *find_word*.

    Tokens are lower-cased before the comparison; *find_word* is used as
    given, so it has to be lower case to match anything.
    """
    return sum(1 for token in txt.split() if token.lower() == find_word)
# Pre-trained fastText classifier; the numeric part of its top predicted label
# for each abstract is stored as the 'category' feature.
model = fasttext.load_model('/kaggle/input/fasttext-essay-category-80/fasttext_arxivcategory.model')
# Names of the per-paper feature columns, in exactly the order in which the
# feature rows are assembled. The country and top-word names are derived from
# Config so that the names cannot drift away from the values that are actually
# computed when Config.country or Config.top changes.
columns = (
    ['authorid', 'person_id', 'essay_id']
    # title
    + ['title_len', 'title_wordcount', 'title_ari', 'title_McAlpine_EFLAW', 'title_CLRI']
    + ['title_word_maxlen', 'title_word_medianlen', 'title_word_meanlen',
       'title_word_stdlen', 'title_word_sumlen']
    # authors and their organisations
    + ['author_count']
    + list(Config.country)
    + ['author_org_count']
    + ['max_orgs_len', 'median_orgs_len', 'mean_orgs_len', 'std_orgs_len', 'sum_orgs_len']
    + ['max_orgs_wordcnt', 'median_orgs_wordcnt', 'mean_orgs_wordcnt',
       'std_orgs_wordcnt', 'sum_orgs_wordcnt']
    # abstract ('abs_sentenece_count' keeps its historical spelling because
    # other code refers to the column by this exact name)
    + ['abs_ari', 'abs_McAlpine_EFLAW', 'abs_CLRI']
    + ['abs_len', 'abs_wordcount', 'abs_sentenece_count']
    + ['max_abs_wordlen', 'median_abs_wordlen', 'mean_abs_wordlen',
       'std_abs_wordlen', 'sum_abs_wordlen']
    + ['max_abs_sentencelen', 'median_abs_sentencelen', 'mean_abs_sentencelen',
       'std_abs_sentencelen', 'sum_abs_sentencelen']
    + ['max_abs_senwordcnt', 'median_abs_senwordcnt', 'mean_abs_senwordcnt',
       'std_abs_senwordcnt', 'sum_abs_senwordcnt']
    # keywords
    + ['keywords_count', 'keywords_len_sum']
    + ['keywords_len_max', 'keywords_len_median', 'keywords_len_mean', 'keywords_len_std']
    # venue
    + ['venue_len', 'venue_wordcount', 'venue_maxwordlen', 'venue_medianwordlen',
       'venue_meanwordlen', 'venue_stdwordlen', 'venue_sumwordlen']
    + ['venue_ari', 'venue_McAlpine_EFLAW', 'venue_CLRI']
    + ['year']
    # top-word counts
    + [f'title_top{i}' for i in range(Config.top)]
    + [f'abstract_top{i}' for i in range(Config.top)]
    + ['category']
)
print(f"len(columns):{len(columns)}")

# Positional index -> column name, used to rename the integer-labelled
# columns of the raw feature tables.
idx2col = dict(enumerate(columns))
def _summary_stats(values):
    """Return ``[max, median, mean, std, sum]`` of *values*.

    An empty input yields five NaNs (NumPy's ``max`` raises on empty input).
    """
    if len(values) == 0:
        return [np.nan] * 5
    return [np.max(values), np.median(values), np.mean(values),
            np.std(values), np.sum(values)]


def _token_counts(text):
    """Map every lower-cased whitespace-separated token of *text* to its count."""
    counts = {}
    for token in text.split():
        token = token.lower()
        counts[token] = counts.get(token, 0) + 1
    return counts


def _paper_features(feat):
    """Return the feature values of one paper record, ordered like ``columns[3:]``.

    *feat* is one entry of ``pid_to_info``; the keys 'title', 'abstract',
    'authors', 'keywords', 'venue' and (optionally) 'year' are read.
    'venue' may be ``None``; the other text fields must be strings.
    """
    title = feat['title']
    abstract = feat['abstract']
    authors = feat['authors']
    keywords = feat['keywords']
    venue = feat['venue']

    # --- title ---------------------------------------------------------
    row = [len(title)]
    title_word_len = [len(word) for word in title.split(" ")]
    row += [len(title_word_len), ARI(title), McAlpine_EFLAW(title), CLRI(title)]
    row += _summary_stats(title_word_len)

    # --- authors / organisations ---------------------------------------
    row.append(len(authors))
    orgs_lower = [author['org'].lower() for author in authors]
    # Number of authors whose organisation string contains each country name.
    # NOTE(review): this is a plain substring test, so short entries such as
    # 'uk' also match inside longer words (e.g. 'ukraine').
    row += [sum(country in org for org in orgs_lower) for country in Config.country]

    # Distinct organisation strings in first-seen order.
    orgs = list(dict.fromkeys(author['org'] for author in authors))
    row.append(len(orgs))
    row += _summary_stats([len(org) for org in orgs])
    row += _summary_stats([len(org.split()) for org in orgs])

    # --- abstract ------------------------------------------------------
    row += [ARI(abstract), McAlpine_EFLAW(abstract), CLRI(abstract)]
    words = abstract.split()
    sentences = abstract.split(".")
    row += [len(abstract), len(words), len(sentences)]
    row += _summary_stats([len(word) for word in words])
    row += _summary_stats([len(sentence) for sentence in sentences])
    row += _summary_stats([len(sentence.split()) for sentence in sentences])

    # --- keywords (column order: count, sum, max, median, mean, std) ----
    len_keyword = np.array([len(word) for word in keywords])
    row += [len(keywords), np.sum(len_keyword)]
    if len(len_keyword):
        row += [np.max(len_keyword), np.median(len_keyword),
                np.mean(len_keyword), np.std(len_keyword)]
    else:
        row += [np.nan] * 4

    # --- venue ---------------------------------------------------------
    if isinstance(venue, str):
        venue_wordlen = [len(word) for word in venue.split(" ")]
        row += [len(venue), len(venue_wordlen)] + _summary_stats(venue_wordlen)
    else:
        # Missing venue: length 0, every other venue statistic unknown.
        row += [0] + [np.nan] * 6
    # The readability helpers score None as the placeholder text "q".
    row += [ARI(venue), McAlpine_EFLAW(venue), CLRI(venue)]

    # --- year ----------------------------------------------------------
    try:
        row.append(int(feat['year']))
    except (KeyError, TypeError, ValueError, OverflowError):
        row.append(np.nan)

    # --- top-word counts -------------------------------------------------
    # Tokenise each text once instead of once per vocabulary entry.
    title_counts = _token_counts(title)
    row += [title_counts.get(Config.titletopwords[i], 0) for i in range(Config.top)]
    abstract_counts = _token_counts(abstract)
    row += [abstract_counts.get(Config.abstracttopwords[i], 0) for i in range(Config.top)]

    # --- fastText category ----------------------------------------------
    # Newlines are removed before prediction; [9:] drops the first nine
    # characters of the top label (the '__label__' prefix) so that the rest
    # parses as an integer.
    prediction = model.predict(abstract.replace('\n', ''), k=len(model.labels))
    row.append(int(prediction[0][0][9:]))
    return row


# ---------------------------------------------------------------- training
train_feats = []
labels = []
for authorid, (person_id, person_info) in enumerate(train_author.items()):
    for data in ('normal_data', 'outliers'):
        for text_id in person_info[data]:
            feat = pid_to_info[text_id]
            train_feats.append([authorid, person_id, feat['id']] + _paper_features(feat))
            # 1 for papers listed under 'normal_data', 0 for 'outliers'.
            labels.append(int(data == 'normal_data'))
    if Config.try_program:
        break
labels = np.array(labels)
# Build the frame straight from the row lists: routing the mixed str/number
# rows through np.array would turn every cell into a fixed-width unicode
# string and multiply the memory footprint.
train_feats = pd.DataFrame(train_feats).rename(columns=idx2col)
print(f"train_feats.shape:{train_feats.shape},labels.shape:{labels.shape}")
print(f"np.mean(labels):{np.mean(labels)}")
train_feats['label'] = labels
train_feats.head()

# -------------------------------------------------------------- validation
valid_feats = []
for authorid, (person_id, person_info) in enumerate(valid_author.items()):
    for text_id in person_info['papers']:
        feat = pid_to_info[text_id]
        valid_feats.append([authorid, person_id, feat['id']] + _paper_features(feat))
    if Config.try_program:
        break
valid_feats = pd.DataFrame(valid_feats).rename(columns=idx2col)
print(f"valid_feats.shape:{valid_feats.shape}")
valid_feats.head()
def feature_engineer(df):
    """Add derived, clipped, one-hot and per-person aggregate features to *df*.

    Steps, in order:

    1. cast every column named in ``columns`` (except the two string id
       columns) to float;
    2. add ratio/total features built from the length and count columns;
    3. map year 2024 to 2023 and year 0 to NaN;
    4. replace zero lengths by a fixed value and clamp 'title_len',
       'abs_len' and 'venue_len' to fixed bounds;
    5. add boolean indicator columns ``category_0`` .. ``category_13``;
    6. for every feature column add its per-``person_id`` sum, mean, median,
       skew, max and std, plus ``gap_<col>`` = person mean - row value;
    7. drop 'person_id' and 'essay_id'.

    Steps 1-5 modify the frame that was passed in; the returned frame is a
    new object with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-paper feature table containing the columns listed in ``columns``.

    Returns
    -------
    pandas.DataFrame
        The extended feature table.
    """
    id_cols = ('person_id', 'essay_id')
    for col in columns:
        if col not in id_cols and col in df.columns:
            df[col] = df[col].astype(float)

    df['mean_abstract_sentence_wordcount'] = df['abs_wordcount'] / df['abs_sentenece_count']
    df['mean_org_author_count'] = df['author_count'] / df['author_org_count']
    df['mean_title_wordlen'] = df['title_len'] / df['title_wordcount']
    df['mean_venue_wordlen'] = df['venue_len'] / df['venue_wordcount']
    df['total_len'] = df['title_len'] + df['abs_len'] + df['venue_len']

    df['year'] = df['year'].replace(2024, 2023)
    df['year'] = df['year'].replace(0, np.nan)

    # (column, replacement for zero, lower bound, upper bound); every number
    # is passed through expm1. NOTE(review): the values look like thresholds
    # read off log1p-scaled distributions - confirm with the author.
    length_rules = (
        ('title_len', 4.5, 3, 6),
        ('abs_len', 7, 3, 9),
        ('venue_len', 3.25, 1.1, 5.5),
    )
    for col, zero_fill, low, high in length_rules:
        df.loc[df[col] == np.expm1(0), col] = np.expm1(zero_fill)
        df.loc[df[col] <= np.expm1(low), col] = np.expm1(low)
        df.loc[df[col] >= np.expm1(high), col] = np.expm1(high)

    category_cols = [f'category_{i}' for i in range(14)]
    for i, name in enumerate(category_cols):
        df[name] = (df['category'] == i)

    agg_cols = [
        col for col in columns + category_cols
        if col not in ('authorid', 'person_id', 'essay_id') and col in df.columns
    ]
    if agg_cols:
        # Compute each per-person statistic once and map it back onto the
        # rows, then attach all new columns with a single concat. This gives
        # the same columns, in the same order, as a left merge on
        # 'person_id' per statistic, without copying the whole (ever wider)
        # frame for every one of them.
        df = df.reset_index(drop=True)
        person_key = df['person_id']
        grouped = df.groupby('person_id')
        new_cols = {}
        for col in agg_cols:
            per_person = grouped[col]
            for stat in ('sum', 'mean', 'median', 'skew', 'max', 'std'):
                new_cols[f'{stat}_{col}'] = person_key.map(getattr(per_person, stat)())
            new_cols[f'gap_{col}'] = new_cols[f'mean_{col}'] - df[col]
        df = pd.concat([df, pd.DataFrame(new_cols)], axis=1)

    return df.drop(columns=['person_id', 'essay_id'])
train_feats = feature_engineer(train_feats)
valid_feats = feature_engineer(valid_feats)

# Columns that are combined pairwise: for every unordered pair (a, b), with a
# listed before b, the sum, difference, product and quotient are added to
# both tables under the names "a+b", "a-b", "a*b" and "a/b".
tops = [
    'title_ari', 'author_count', 'venue_stdwordlen', 'year',
    'mean_org_author_count', 'total_len', 'gap_author_count',
    'mean_abs_wordlen', 'std_abs_wordlen', 'sum_abs_wordlen',
    'keywords_len_mean', 'keywords_len_std',
]
for position, first in enumerate(tops):
    for second in tops[position + 1:]:
        for frame in (train_feats, valid_feats):
            frame[f"{first}+{second}"] = frame[first] + frame[second]
            frame[f"{first}-{second}"] = frame[first] - frame[second]
            frame[f"{first}*{second}"] = frame[first] * frame[second]
            frame[f"{first}/{second}"] = frame[first] / frame[second]
useless_cols=['abstract_top89', 'mean_abstract_top39', 'median_title_top66', 'max_title_top0', 'sum_keywords_len_max', 'max_abstract_top49', 'median_title_top79', 'mean_title_top30', 'sum_abstract_top89', 'median_max_orgs_wordcnt', 'std_abs_CLRI', 'median_title_top8', 'median_mean_orgs_wordcnt', 'author_org_count', 'abstract_top75', 'mean_title_top49', 'gap_title_top87', 'median_abstract_top31', 'median_title_top60', 'max_title_top81', 'std_sum_abs_senwordcnt', 'std_std_orgs_wordcnt', 'std_abstract_top16', 'std_abstract_top91', 'sum_title_top33', 'mean_title_top57', 'std_sum_abs_wordlen', 'mean_std_abs_sentencelen', 'std_title_top46', 'median_abstract_top98', 'abstract_top66', 'title_top83', 'median_title_top1', 'max_abstract_top27', 'sum_abstract_top23', 'max_title_top66', 'max_title_top42', 'max_abstract_top6', 'sum_abstract_top30', 'max_abstract_top86', 'sum_title_ari', 'sum_abstract_top25', 'median_abstract_top64', 'abstract_top91', 'abstract_top17', 'std_title_top13', 'max_title_word_medianlen', 'median_title_top89', 'sum_title_top82', 'sum_title_top64', 'sum_abstract_top43', 'mean_title_word_sumlen', 'std_title_top53', 'std_abs_len', 'median_abstract_top9', 'title_top42', 'max_venue_ari', 'median_mean_orgs_len', 'sum_abstract_top87', 'median_abstract_top94', 'mean_title_top67', 'max_abstract_top91', 'abstract_top31', 'mean_max_abs_senwordcnt', 'std_mean_orgs_len', 'max_title_top69', 'sum_abstract_top31', 'median_title_word_medianlen', 'title_top36', 'sum_abstract_top10', 'abstract_top14', 'sum_abstract_top14', 'sum_title_top10', 'max_abstract_top83', 'std_title_top40', 'abstract_top86', 'median_title_top98', 'max_abstract_top4', 'max_abstract_top87', 'sum_title_top41', 'mean_title_top26', 'gap_title_top66', 'max_abstract_top81', 'gap_title_top83', 'abstract_top64', 'max_venue_maxwordlen', 'median_title_top39', 'max_title_top25', 'mean_mean_abs_senwordcnt', 'std_title_top19', 'max_title_top70', 'median_title_word_maxlen', 'mean_title_top82', 
'max_abstract_top48', 'sum_std_orgs_len', 'max_abstract_top18', 'title_top3', 'std_title_top59', 'median_title_top41', 'mean_abstract_top79', 'title_top40', 'title_top67', 'abstract_top73', 'std_title_top26', 'sum_abstract_top28', 'median_abstract_top91', 'max_title_top2', 'median_title_top17', 'median_title_top64', 'median_abstract_top28', 'abstract_top83', 'std_title_top48', 'sum_abstract_top66', 'std_title_top92', 'skew_title_top98', 'title_top54', 'max_abstract_top23', 'max_title_top74', 'max_title_top35', 'mean_max_orgs_len', 'max_title_top61', 'title_top63', 'max_title_top82', 'sum_author_count', 'median_title_top69', 'max_title_top33', 'median_abstract_top84', 'median_abstract_top88', 'title_top50', 'sum_max_abs_wordlen', 'sum_title_top19', 'title_top93', 'median_keywords_count', 'median_title_top59', 'mean_abstract_top92', 'median_abstract_top66', 'max_title_top3', 'mean_mean_orgs_wordcnt', 'title_top84', 'abstract_top30', 'std_title_top57', 'title_top56', 'median_title_top48', 'title_top51', 'sum_keywords_len_std', 'std_title_top50', 'gap_title_top80', 'sum_venue_medianwordlen', 'max_title_top8', 'abstract_top4', 'median_median_abs_wordlen', 'sum_title_top36', 'max_abstract_top63', 'max_title_top84', 'sum_title_top57', 'abstract_top92', 'median_title_top80', 'abstract_top87', 'max_title_top48', 'title_top24', 'max_abstract_top78', 'title_top96', 'title_top65', 'sum_abstract_top4', 'abstract_top3', 'median_abstract_top20', 'title_top35', 'abstract_top60', 'median_abstract_top35', 'median_abstract_top33', 'std_title_top27', 'abstract_top63', 'max_abs_len', 'sum_abstract_top8', 'sum_title_McAlpine_EFLAW', 'max_title_top85', 'sum_abstract_top72', 'std_abstract_top93', 'title_top69', 'median_abstract_top85', 'median_abstract_top19', 'median_max_orgs_len', 'max_title_top54', 'median_title_top31', 'median_title_top29', 'sum_abstract_top46', 'std_abs_sentenece_count', 'median_median_abs_senwordcnt', 'sum_abs_len', 'max_abstract_top43', 'title_top26', 
'max_abstract_top0', 'sum_abstract_top27', 'max_title_top23', 'title_top72', 'abstract_top41', 'title_top70', 'sum_title_word_stdlen', 'sum_title_top35', 'median_title_top62', 'max_abstract_top98', 'sum_title_top32', 'sum_abstract_top74', 'mean_title_len', 'mean_abstract_top32', 'std_abstract_top79', 'max_abstract_top26', 'max_abstract_top46', 'median_title_top14', 'median_abstract_top53', 'median_std_orgs_wordcnt', 'sum_title_top0', 'median_title_top10', 'std_abstract_top9', 'title_top39', 'median_abs_wordcount', 'max_venue_meanwordlen', 'max_title_top19', 'max_venue_McAlpine_EFLAW', 'abstract_top58', 'sum_title_top70', 'gap_title_top55', 'median_title_top97', 'sum_abstract_top13', 'sum_abstract_top16', 'abstract_top67', 'title_top5', 'mean_title_top99', 'median_title_top27', 'mean_title_top92', 'title_top68', 'max_abs_McAlpine_EFLAW', 'abstract_top19', 'abstract_top85', 'max_abstract_top56', 'max_title_top80', 'sum_abstract_top9', 'sum_title_top94', 'max_abstract_top64', 'median_title_top65', 'max_abstract_top93', 'skew_title_top88', 'abstract_top84', 'sum_title_top51', 'std_title_top75', 'sum_abstract_top95', 'median_title_top40', 'sum_sum_abs_sentencelen', 'sum_title_top91', 'median_sum_orgs_wordcnt', 'skew_abs_CLRI', 'title_top10', 'max_abstract_top54', 'max_title_top77', 'median_abstract_top26', 'mean_title_CLRI', 'sum_title_top12', 'median_title_top49', 'abstract_top77', 'median_abstract_top77', 'median_abstract_top89', 'title_top82', 'median_abstract_top58', 'median_title_McAlpine_EFLAW', 'std_title_top31', 'gap_title_top57', 'sum_abstract_top65', 'sum_title_top72', 'median_abstract_top6', 'max_title_top98', 'sum_title_top45', 'sum_title_top18', 'max_abstract_top14', 'max_title_top62', 'sum_abstract_top76', 'mean_title_top75', 'std_title_top39', 'sum_abstract_top71', 'max_abstract_top61', 'max_abstract_top66', 'sum_title_top52', 'mean_title_top3', 'std_title_top80', 'sum_title_top61', 'title_top37', 'sum_std_abs_sentencelen', 'sum_abstract_top79', 
'mean_title_top62', 'median_abstract_top92', 'max_sum_abs_sentencelen', 'sum_title_top15', 'sum_title_top7', 'median_title_top84', 'median_abstract_top11', 'skew_title_top73', 'mean_abstract_top88', 'skew_title_top57', 'max_abstract_top15', 'median_abs_len', 'mean_title_word_stdlen', 'sum_max_orgs_wordcnt', 'max_abstract_top53', 'std_title_top55', 'sum_title_top3', 'median_title_top13', 'sum_abs_sentenece_count', 'max_abstract_top62', 'median_abstract_top75', 'max_abstract_top20', 'max_title_top68', 'median_abstract_top49', 'max_max_orgs_wordcnt', 'median_abstract_top55', 'max_title_top78', 'max_title_top93', 'sum_title_top8', 'title_top62', 'sum_title_top31', 'std_abstract_top12', 'mean_title_top64', 'abstract_top98', 'title_top58', 'sum_median_orgs_wordcnt', 'skew_title_top25', 'std_title_top38', 'max_abstract_top44', 'sum_title_top95', 'std_abs_McAlpine_EFLAW', 'sum_title_top85', 'std_title_top66', 'median_abstract_top90', 'sum_title_word_meanlen', 'max_abstract_top88', 'std_title_top91', 'sum_abstract_top55', 'median_title_top43', 'median_title_top3', 'median_title_top26', 'median_title_top34', 'sum_abs_wordcount', 'abstract_top53', 'std_title_top44', 'std_title_top49', 'sum_title_top40', 'title_top55', 'median_title_top2', 'title_top11', 'median_sum_abs_sentencelen', 'median_title_top72', 'max_abstract_top24', 'median_abstract_top27', 'std_abstract_top2', 'sum_abstract_top82', 'max_abstract_top67', 'max_title_top13', 'max_abstract_top2', 'median_title_top63', 'std_title_top86', 'sum_title_top20', 'sum_title_top46', 'max_title_top50', 'sum_title_top42', 'sum_title_top88', 'std_title_top88', 'median_abstract_top23', 'skew_abstract_top57', 'sum_abstract_top63', 'median_title_top7', 'sum_venue_sumwordlen', 'sum_abstract_top7', 'sum_abstract_top20', 'venue_wordcount', 'median_title_top5', 'max_abstract_top97', 'gap_title_top47', 'title_top99', 'median_title_top11', 'abstract_top48', 'abstract_top35', 'sum_median_orgs_len', 'abstract_top39', 'max_title_top79', 
'max_abstract_top12', 'max_abstract_top22', 'std_title_top29', 'sum_title_top50', 'max_abstract_top5', 'median_median_abs_sentencelen', 'max_abstract_top8', 'abstract_top74', 'median_abstract_top46', 'sum_abstract_top50', 'mean_title_top34', 'mean_abs_CLRI', 'median_title_top74', 'title_top7', 'max_title_top51', 'median_max_abs_sentencelen', 'std_title_top70', 'max_title_top91', 'max_title_top17', 'skew_abs_McAlpine_EFLAW', 'title_top15', 'max_abstract_top41', 'skew_title_top44', 'median_title_top23', 'max_abstract_top47', 'sum_title_top92', 'median_abstract_top72', 'median_title_top78', 'abstract_top7', 'std_abs_wordcount', 'sum_title_top56', 'title_top64', 'sum_std_orgs_wordcnt', 'sum_abstract_top73', 'skew_abs_wordcount', 'median_title_top35', 'median_abstract_top93', 'sum_title_top86', 'title_top95', 'median_title_top82', 'median_title_top33', 'median_abstract_top30', 'abstract_top37', 'max_title_top73', 'median_abstract_top54', 'std_title_top61', 'std_abstract_top95', 'std_abstract_top89', 'sum_title_top99', 'sum_title_top63', 'sum_abstract_top52', 'sum_title_top80', 'max_mean_orgs_len', 'sum_year', 'title_top98', 'std_title_top87', 'median_title_top83', 'abstract_top10', 'mean_title_top56', 'median_title_top36', 'median_title_top55', 'mean_title_top85', 'title_top90', 'median_title_top94', 'abstract_top69', 'max_abstract_top16', 'gap_title_top90', 'sum_keywords_len_median', 'title_top97', 'median_abstract_top7', 'sum_venue_stdwordlen', 'median_abstract_top96', 'max_title_top10', 'sum_median_abs_senwordcnt', 'mean_title_top86', 'abstract_top20', 'sum_abstract_top91', 'max_title_top72', 'sum_title_top11', 'skew_title_top78', 'max_title_top38', 'abstract_top59', 'median_title_top70', 'mean_title_top32', 'sum_abstract_top11', 'std_title_top56', 'mean_abstract_top61', 'title_top79', 'mean_title_top42', 'mean_title_top31', 'mean_abstract_top8', 'title_top9', 'max_title_top20', 'abstract_top24', 'abstract_top46', 'sum_sum_orgs_wordcnt', 'sum_title_top22', 
'max_title_top47', 'max_title_top58', 'max_title_top65', 'std_title_top72', 'std_title_top84', 'mean_title_top74', 'max_title_top49', 'sum_abstract_top36', 'max_title_top6', 'max_title_top43', 'abstract_top72', 'sum_title_top98', 'max_abstract_top35', 'gap_title_top89', 'median_title_top58', 'max_abstract_top77', 'sum_abstract_top1', 'max_title_top90', 'median_title_top28', 'abstract_top62', 'max_title_top86', 'max_title_top92', 'median_title_top67', 'std_abstract_top92', 'sum_abstract_top54', 'median_abstract_top24', 'sum_abstract_top6', 'mean_title_top61', 'max_title_top16', 'gap_title_top70', 'sum_title_top76', 'sum_mean_orgs_wordcnt', 'sum_abstract_top41', 'std_title_top93', 'max_abstract_top33', 'median_title_top44', 'median_median_orgs_wordcnt', 'mean_title_top87', 'max_title_top7', 'std_title_top76', 'abstract_top99', 'abstract_top23', 'sum_title_top65', 'mean_title_top53', 'median_title_top50', 'max_abstract_top58', 'median_title_top30', 'std_title_top62', 'std_title_top69', 'max_title_top36', 'abstract_top28', 'max_title_top29', 'max_title_top24', 'std_title_top81', 'std_title_top71', 'sum_title_top26', 'sum_median_abs_wordlen', 'abstract_top34', 'abstract_top65', 'median_abs_sentenece_count', 'sum_abstract_top24', 'max_abstract_top76', 'title_top16', 'sum_venue_len', 'max_title_top1', 'title_top34', 'title_top23', 'sum_abstract_top81', 'max_abstract_top60', 'median_abstract_top48', 'median_abstract_top61', 'median_abstract_top63', 'median_title_top56', 'sum_author_org_count', 'max_title_top14', 'median_title_top96', 'max_title_top44', 'max_title_top97', 'max_abstract_top74', 'sum_abstract_top94', 'skew_title_top46', 'sum_sum_orgs_len', 'sum_abstract_top59', 'std_abstract_top51', 'mean_abstract_top4', 'max_abstract_top94', 'abstract_top82', 'max_title_top96', 'median_abstract_top1', 'abstract_top8', 'median_abstract_top45', 'std_abstract_top33', 'max_title_top27', 'skew_sum_abs_sentencelen', 'title_top14', 'abstract_top29', 'max_title_top9', 
'sum_title_top49', 'max_title_top64', 'max_title_top83', 'abstract_top97', 'sum_abstract_top67', 'sum_abstract_top93', 'mean_title_top91', 'sum_title_top67', 'sum_title_top25', 'max_title_top94', 'mean_abstract_top95', 'title_top25', 'abstract_top52', 'max_title_top39', 'median_title_top24', 'sum_mean_abs_sentencelen', 'mean_title_top55', 'mean_title_top69', 'skew_title_top76', 'median_title_top92', 'mean_abs_McAlpine_EFLAW', 'max_title_top26', 'mean_title_top48', 'mean_mean_orgs_len', 'sum_abstract_top2', 'std_title_top58', 'median_author_count', 'abstract_top25', 'median_title_top4', 'abstract_top81', 'sum_title_wordcount', 'title_top4', 'median_title_top16', 'title_top52', 'max_title_top76', 'max_abstract_top55', 'mean_median_orgs_len', 'max_abstract_top99', 'abstract_top38', 'sum_title_word_sumlen', 'abstract_top9', 'std_title_top30', 'median_abstract_top5', 'median_abstract_top34', 'max_abstract_top84', 'mean_title_top63', 'sum_abstract_top98', 'sum_title_top43', 'median_title_top93', 'max_abstract_top38', 'sum_abstract_top12', 'mean_title_top46', 'median_abstract_top81', 'sum_abstract_top34', 'median_title_top52', 'sum_abs_ari', 'skew_title_top65', 'title_top91', 'std_title_top73', 'std_abs_ari', 'median_title_top21', 'max_abstract_top71', 'max_title_top21', 'median_abstract_top15', 'median_abstract_top50', 'max_mean_orgs_wordcnt', 'max_abstract_top31', 'std_title_top89', 'sum_title_top37', 'median_abstract_top80', 'max_title_top41', 'median_abstract_top44', 'std_abstract_top88', 'skew_title_top89', 'std_title_top35', 'sum_abstract_top38', 'abstract_top95', 'title_top17', 'std_title_top9', 'skew_title_top60', 'sum_abstract_top86', 'mean_abs_wordcount', 'skew_title_top63', 'title_top86', 'gap_title_top82', 'sum_title_top39', 'max_abstract_top42', 'median_abstract_top2', 'title_top28', 'title_top85', 'title_top45', 'sum_max_orgs_len', 'sum_abstract_top44', 'median_abstract_top62', 'max_title_ari', 'sum_title_top13', 'title_top46', 'sum_title_top89', 
'median_title_top85', 'abstract_top93', 'median_title_top45', 'sum_abstract_top70', 'max_abstract_top39', 'max_title_top22', 'gap_title_top86', 'std_title_top95', 'mean_title_top9', 'mean_title_top40', 'std_title_top45', 'std_title_top94', 'mean_title_top97', 'sum_abstract_top42', 'median_abstract_top59', 'std_abstract_top39', 'mean_sum_abs_sentencelen', 'median_title_word_sumlen', 'skew_abs_len', 'max_sum_abs_senwordcnt', 'sum_abstract_top39', 'median_abstract_top82', 'sum_title_top79', 'median_title_top46', 'sum_abstract_top18', 'title_top32', 'sum_title_top9', 'max_title_top37', 'median_sum_abs_senwordcnt', 'gap_title_top93', 'sum_abstract_top92', 'sum_abstract_top37', 'sum_title_top55', 'max_title_top60', 'median_abstract_top99', 'sum_venue_McAlpine_EFLAW', 'sum_sum_abs_wordlen', 'gap_title_top53', 'median_title_top77', 'sum_venue_maxwordlen', 'sum_abstract_top69', 'std_title_top99', 'abstract_top51', 'sum_title_top47', 'max_abstract_top10', 'median_abstract_top36', 'median_venue_wordcount', 'max_title_wordcount', 'std_abstract_top36', 'sum_abstract_top77', 'median_abstract_top52', 'sum_abstract_top53', 'max_abstract_top51', 'title_top92', 'std_title_top67', 'sum_abstract_top56', 'max_title_top67', 'sum_venue_meanwordlen', 'std_title_top24', 'sum_title_top75', 'sum_title_top4', 'sum_title_top84', 'median_title_top91', 'median_abstract_top41', 'title_top75', 'title_top27', 'median_abstract_top78', 'title_top29', 'sum_title_top58', 'sum_abstract_top47', 'max_abstract_top79', 'median_abstract_top51', 'title_top1', 'skew_title_top47', 'sum_abstract_top64', 'max_abstract_top85', 'max_abstract_top21', 'max_abstract_top45', 'max_abstract_top36', 'max_abstract_top1', 'sum_abstract_top96', 'sum_title_top90', 'max_abstract_top70', 'max_title_top31', 'median_median_orgs_len', 'sum_title_top69', 'median_title_wordcount', 'title_top47', 'title_top81', 'sum_median_abs_sentencelen', 'median_abstract_top17', 'abstract_top49', 'mean_title_top23', 'median_abstract_top4', 
'std_title_top43', 'std_title_top90', 'sum_abstract_top3', 'std_abstract_top67', 'sum_max_abs_sentencelen', 'mean_median_abs_sentencelen', 'sum_abstract_top57', 'median_max_abs_wordlen', 'median_abstract_top47', 'abstract_top50', 'median_abstract_top60', 'max_abstract_top32', 'sum_abstract_top48', 'median_abstract_top73', 'abstract_top61', 'median_title_top37', 'abstract_top18', 'median_abstract_top39', 'mean_abstract_top70', 'sum_abstract_top19', 'skew_abs_ari', 'max_title_top32', 'std_title_top60', 'abstract_top36', 'max_median_abs_senwordcnt', 'sum_abstract_top0', 'title_top53', 'abstract_top44', 'sum_title_top38', 'median_title_top57', 'sum_title_top78', 'mean_title_top72', 'sum_abstract_top32', 'std_abstract_top61', 'mean_std_abs_senwordcnt', 'abstract_top11', 'std_title_top32', 'max_title_top18', 'max_title_top95', 'sum_title_word_medianlen', 'median_title_top88', 'mean_title_top78', 'sum_mean_orgs_len', 'skew_abstract_top4', 'sum_mean_abs_senwordcnt', 'max_title_top99', 'sum_title_top71', 'title_top48', 'median_title_top18', 'median_abs_wordlen', 'abstract_top45', 'median_title_top32', 'title_top38', 'sum_abstract_top22', 'max_title_top59', 'max_abstract_top30', 'median_abstract_top76', 'sum_abstract_top62', 'max_title_top75', 'skew_title_top13', 'std_title_top52', 'max_abstract_top50', 'sum_title_top27', 'title_top43','std_title_top79', 'median_title_top76', 'sum_abstract_top84', 'skew_sum_abs_senwordcnt', 'median_title_top15', 'max_title_top87', 'median_venue_len', 'max_title_top55', 'abstract_top43', 'median_abs_McAlpine_EFLAW', 'median_title_top71', 'max_sum_orgs_wordcnt', 'max_title_top56', 'max_abstract_top13', 'median_abstract_top18', 'max_abstract_top3', 'max_abstract_top89', 'std_abstract_top1', 'median_abstract_top16', 'mean_mean_abs_sentencelen', 'median_abstract_top68', 'mean_sum_orgs_len', 'abstract_top32', 'sum_title_top6', 'median_title_top99', 'sum_abstract_top78', 'skew_mean_abs_senwordcnt', 'title_top33', 'max_median_abs_wordlen', 
'max_abstract_top7', 'abstract_top27', 'mean_venue_sumwordlen', 'sum_abstract_top26', 'mean_title_top47', 'median_abstract_top79', 'std_title_top28', 'max_venue_wordcount', 'sum_title_top53', 'median_abstract_top56', 'median_venue_medianwordlen', 'abstract_top13', 'median_title_top38', 'max_abstract_top52', 'max_abstract_top28', 'sum_abstract_top75', 'max_abstract_top75', 'median_abstract_top95', 'max_abstract_top57', 'mean_title_top59', 'median_author_org_count', 'median_abstract_top14', 'max_title_top30', 'max_abstract_top17', 'sum_title_top21', 'std_title_top54', 'mean_abstract_top7', 'mean_abs_ari', 'max_abstract_top69', 'abstract_top54', 'mean_title_top83', 'max_title_top40', 'max_abstract_top73', 'median_title_top53', 'max_abstract_top9', 'std_title_top64', 'sum_std_abs_senwordcnt', 'abstract_top96', 'sum_std_abs_wordlen', 'mean_max_abs_sentencelen', 'max_sum_abs_wordlen', 'sum_title_top48', 'title_top71', 'abstract_top1', 'mean_title_top76', 'max_abstract_top59', 'std_title_top12', 'mean_abs_len', 'max_title_top89', 'median_abstract_top87', 'median_abstract_top10', 'std_title_top23', 'abstract_top90', 'title_top12', 'std_title_top78', 'median_abstract_top42', 'abstract_top71', 'title_top2', 'median_title_top75', 'abstract_top42', 'median_abstract_top32', 'sum_abstract_top51', 'max_abstract_top96', 'median_venue_maxwordlen', 'sum_abstract_top17', 'title_top20', 'max_abstract_top65', 'gap_title_top65', 'sum_sum_abs_senwordcnt', 'std_title_top34', 'max_title_top4', 'std_title_top85', 'max_abstract_top11', 'title_top66', 'std_title_top68', 'sum_title_top23', 'median_abstract_top67', 'title_top30', 'skew_title_top49', 'sum_title_top30', 'abstract_top80', 'title_top59', 'max_title_top71', 'title_top74', 'median_title_top25', 'sum_title_top73', 'max_title_top46', 'abstract_top76', 'sum_max_abs_senwordcnt', 'mean_title_top66', 'std_title_top74', 'std_title_top77', 'median_abstract_top3', 'max_abstract_top82', 'std_title_top97', 'median_abstract_top83', 
'sum_title_top77', 'max_abstract_top29', 'sum_abs_McAlpine_EFLAW', 'std_title_top47', 'title_top89', 'median_title_top95', 'max_title_top15', 'max_title_top57', 'abstract_top55', 'sum_title_top34', 'max_title_top28', 'median_abstract_top97', 'std_title_top83', 'sum_title_top68', 'mean_title_top29', 'title_top78', 'mean_sum_abs_senwordcnt', 'median_std_orgs_len', 'sum_keywords_len_sum', 'std_title_top96', 'sum_abstract_top33', 'mean_title_top37', 'max_title_top45', 'median_title_top12', 'sum_title_top28', 'skew_title_top77', 'mean_title_top84', 'std_title_top20', 'median_abstract_top29', 'title_top21', 'max_author_org_count', 'mean_title_top88', 'sum_keywords_count', 'median_abstract_top21', 'std_title_top63', 'median_abstract_top86', 'skew_abstract_top60', 'median_title_top54', 'sum_title_top44', 'median_title_top61', 'max_median_orgs_wordcnt', 'sum_keywords_len_mean', 'median_title_top68', 'median_title_top81', 'median_abstract_top37', 'std_abstract_top57', 'title_top76', 'mean_title_top90', 'abstract_top21', 'title_top60', 'median_abstract_top38', 'median_title_top73', 'mean_title_top93', 'abstract_top94', 'median_title_top51', 'sum_title_top83', 'median_title_top9', 'median_abstract_top12', 'max_abstract_top40', 'mean_title_top13', 'sum_title_top29', 'median_abstract_top13', 'median_max_abs_senwordcnt', 'median_title_top22', 'sum_venue_ari', 'max_title_top34', 'median_title_top86', 'median_title_top47', 'median_venue_McAlpine_EFLAW', 'max_title_top11', 'median_title_top19', 'max_title_top88', 'abstract_top40', 'max_abstract_top68', 'max_abstract_top19', 'median_title_top6', 'abstract_top56', 'median_abstract_top57', 'std_title_top98', 'mean_abstract_top51', 'abstract_top78', 'std_title_top36', 'median_sum_orgs_len', 'sum_mean_abs_wordlen', 'median_title_top90', 'sum_title_top96', 'sum_title_top97', 'sum_abstract_top60', 'title_top22', 'sum_title_top93', 'mean_title_top27', 'median_title_top42', 'max_abstract_top80', 'std_abstract_top7', 'sum_abstract_top88', 
'mean_title_top79', 'abstract_top79', 'title_top18', 'median_abstract_top74', 'std_max_orgs_len', 'std_title_top51', 'title_top41', 'sum_abstract_top21', 'max_title_top5', 'abstract_top68', 'max_title_top63', 'skew_title_top90', 'skew_title_top82', 'gap_title_top26', 'title_top49', 'std_title_top42', 'median_abstract_top70', 'max_std_orgs_wordcnt', 'abstract_top47', 'sum_title_top2', 'abstract_top26', 'abstract_top88', 'title_top0', 'median_title_top0', 'sum_abstract_top45', 'mean_title_top73', 'median_abstract_top0', 'median_keywords_len_max', 'max_title_top52', 'std_title_top65', 'abstract_top70', 'max_title_top53', 'sum_abstract_top85', 'max_abstract_top92', 'sum_abstract_top49', 'gap_title_top74', 'title_top19', 'sum_title_top59', 'median_abstract_top69', 'median_title_top87', 'max_abstract_top90', 'title_top88', 'title_top94', 'std_sum_abs_sentencelen', 'sum_title_top60', 'sum_title_top54', 'max_venue_medianwordlen', 'sum_title_top17', 'mean_title_top19', 'sum_title_top1', 'max_abstract_top25', 'max_title_top12', 'title_top31', 'sum_abstract_top61', 'sum_venue_wordcount', 'abstract_top22', 'skew_title_top29', 'median_abstract_top25', 'mean_median_abs_senwordcnt', 'std_title_top82', 'median_abstract_top65', 'max_abstract_top72', 'max_abs_sentenece_count', 'mean_title_top89', 'sum_title_top74', 'sum_venue_CLRI', 'median_title_top20', 'sum_title_len', 'mean_median_orgs_wordcnt', 'sum_abstract_top29', 'sum_title_top62', 'sum_abstract_top80', 'sum_title_top24', 'max_abstract_top34', 'title_top44', 'sum_title_top66', 'mean_title_top20', 'title_top80', 'median_abstract_top22', 'sum_abs_sentencelen', 'median_abstract_top40', 'mean_title_top44', 'median_abstract_top8', 'sum_title_top87', 'title_top61', 'title_top57', 'max_abstract_top95', 'median_abstract_top71', 'title_top8', 'title_top6', 'max_abs_wordcount', 'std_title_top37', 'median_abstract_top43', 'title_top87', 'max_abstract_top37', 'skew_title_top50', 'sum_title_word_maxlen','std_title_top22', 
'std_title_top18', 'abstract_top57', 'abstract_top5', 'std_abstract_top18', 'std_title_top25', 'skew_title_top40', 'skew_title_top85', 'mean_title_top77', 'median_std_abs_sentencelen', 'title_top73', 'median_venue_meanwordlen', 'skew_title_top51']
# Further batches of column names appended to useless_cols; the complete list
# is dropped from train_feats and valid_feats right after these statements.
# NOTE(review): how these names were selected is not visible in this part of
# the file -- confirm against the feature-selection step before editing.

# Batch 1: names built from country/region strings, alone or with an
# aggregate prefix (sum_/mean_/median_/max_/std_/skew_/gap_), plus one ratio
# feature. Presumably per-country indicator features -- TODO confirm.
useless_cols+=['japan', 'france', 'switzerland', 'uk','germany', 'canada', 'australia', 'hong kong', 'united states', 'u.s.a', 'singapore', 'united kingdom', 'russia', 'median_china', 'sum_japan', 'median_japan', 'max_japan', 'median_france', 'max_france', 'median_usa', 'sum_switzerland', 'median_switzerland', 'max_switzerland', 'std_switzerland','median_uk', 'max_uk', 'sum_germany', 'median_germany', 'max_germany', 'sum_canada', 'median_canada', 'sum_australia', 'median_australia', 'max_australia', 'std_australia', 'median_hong kong', 'median_united states', 'sum_u.s.a', 'mean_u.s.a', 'median_u.s.a', 'skew_u.s.a', 'max_u.s.a', 'std_u.s.a', 'gap_u.s.a', 'median_singapore', 'max_singapore', 'sum_united kingdom', 'median_united kingdom', 'max_united kingdom', 'std_united kingdom','sum_russia', 'mean_russia', 'median_russia', 'max_russia', 'std_russia', 'gap_russia','gap_switzerland', 'max_united states','mean_switzerland', 'author_count/mean_org_author_count','sum_uk', 'max_hong kong', 'sum_united states']
# Batch 2: assorted aggregate features over author counts, year, venue,
# abstract word length and title/abstract top-word columns.
useless_cols+=['max_author_count', 'skew_mean_abs_wordlen', 'skew_venue_wordcount', 'skew_venue_McAlpine_EFLAW', 'mean_year', 'skew_year', 'skew_title_top4', 'skew_title_top11', 'skew_title_top42', 'skew_title_top53', 'gap_title_top56', 'std_abstract_top10', 'std_abstract_top22', 'std_abstract_top35', 'skew_abstract_top47', 'std_abstract_top69', 'skew_abstract_top72', 'skew_abstract_top93']
# Batch 3: mostly category_<n> columns and their aggregates, plus a few
# abstract/title features.
useless_cols+=['category_3', 'sum_category_8', 'abstract_top0', 'category_8', 'max_category_6', 'sum_category_10', 'abs_sentenece_count', 'mean_category_7', 'median_category_6', 'max_category_0', 'max_category_5', 'skew_category_13', 'category_12', 'median_category_3', 'category_0', 'median_category_7', 'max_category_1', 'max_category_12', 'median_category_11', 'median_category_9', 'title_word_medianlen', 'std_category_7', 'median_category_0', 'max_category_10', 'max_category_2', 'median_category_12', 'gap_category_8', 'std_category_13', 'max_category_8', 'category_1', 'median_category', 'category_10', 'sum_category_11', 'max_category_7', 'category_13', 'median_category_8', 'median_category_4', 'category_4', 'median_category_2', 'max_category_4', 'median_category_13', 'std_category_8', 'median_category_10', 'max_category_13', 'gap_category_7', 'max_category_3', 'max_category_11', 'category_7', 'sum_category_13', 'median_category_1', 'category_6', 'max_category_9', 'median_category_5', 'sum_category_7']
# Batch 4: remaining base features plus pairwise combination features whose
# names join two feature names with '+', '-' or '*' (most involve
# sum_abs_wordlen).
useless_cols+=['abstract_top2', 'abstract_top6', 'author_count', 'china', 'median_abs_senwordcnt', 'abstract_top33', 'gap_category_13', 'max_orgs_wordcnt', 'sum_abs_wordlen', 'total_len+std_abs_wordlen', 'mean_abs_wordlen+sum_abs_wordlen', 'mean_abs_wordlen-sum_abs_wordlen', 'std_abs_wordlen+sum_abs_wordlen', 'std_abs_wordlen-sum_abs_wordlen', 'abs_len', 'category_11', 'std_category_11', 'author_count-sum_abs_wordlen', 'venue_stdwordlen+sum_abs_wordlen', 'mean_org_author_count+sum_abs_wordlen', 'total_len+mean_abs_wordlen', 'total_len-mean_abs_wordlen', 'std_category_2', 'venue_stdwordlen-sum_abs_wordlen', 'gap_author_count-sum_abs_wordlen', 'sum_category_4', 'title_ari+sum_abs_wordlen', 'mean_org_author_count-sum_abs_wordlen', 'total_len-std_abs_wordlen', 'category_9', 'mean_category_8', 'total_len*sum_abs_wordlen', 'total_len+sum_abs_wordlen', 'skew_category_7', 'category_2', 'std_category_4', 'title_ari-sum_abs_wordlen', 'year*sum_abs_wordlen', 'mean_category_4', 'author_count+sum_abs_wordlen', 'max_category', 'gap_category_11', 'year-sum_abs_wordlen', 'sum_category_2']
# Remove every column named in useless_cols from both feature frames
# (train first, then validation), modifying the frames in place.
for frame in (train_feats, valid_feats):
    frame.drop(columns=useless_cols, inplace=True)

# Turn +/-infinity into NaN in both frames so that only one kind of
# "missing" marker remains in the data.
infinite_values = [np.inf, -np.inf]
for frame in (train_feats, valid_feats):
    frame.replace(infinite_values, np.nan, inplace=True)
# Apply log1p to right-skewed features. Whether a column is transformed is
# decided from the training frame only; the same transform is then applied
# to the validation frame.
for col in valid_feats.columns:
    train_col = train_feats[col]
    # Require skewness above 1 and a training minimum above -1, the lower
    # limit of log1p's real-valued domain.
    if not (train_col.skew() > 1 and train_col.min() > -1):
        continue
    print(f"skew:{col}")
    train_feats[col] = np.log1p(train_col)
    valid_feats[col] = np.log1p(valid_feats[col])
# Blank out isolated extreme values of the 'len' / 'count' / 'top' features.
#
# For each such column the distinct training values are split into "rare"
# values (seen at most `margin` times) and "common" values (seen more often).
# If the rare values together cover less than 0.25% of the training rows and
# they all lie strictly above (case 1) or strictly below (case 2) every
# common value, everything beyond the midpoint between the two groups is set
# to NaN in both the training and the validation frame.
#
# The value counts are used as a Series directly rather than through
# `to_frame().reset_index()`, whose column names differ between pandas
# versions (the 'count' column only exists from pandas 2.0 on).
# `Series.mask` is used instead of assigning NaN through `.loc` so that
# integer columns are upcast by pandas itself rather than relying on the
# setitem upcasting that pandas 2.1 deprecated.
margin = 10
rare_row_budget = len(train_feats) * 0.0025
for col in valid_feats.columns:
    if not (('len' in col) or ('count' in col) or ('top' in col)):
        continue

    counts = train_feats[col].value_counts()
    rare = counts[counts <= margin]
    common = counts[counts > margin]
    # Without both groups there is no boundary to cut at; with too many rows
    # in the rare group the values are not treated as outliers.
    if rare.empty or common.empty or rare.sum() >= rare_row_budget:
        continue

    less_value_min, less_value_max = rare.index.min(), rare.index.max()
    more_value_min, more_value_max = common.index.min(), common.index.max()

    # The two cases are mutually exclusive: the rare group cannot lie both
    # entirely above and entirely below the common group.
    if less_value_min > more_value_max:
        print(f"1:{col}")
        value = (less_value_min + more_value_max) / 2
        train_feats[col] = train_feats[col].mask(train_feats[col] >= value)
        valid_feats[col] = valid_feats[col].mask(valid_feats[col] >= value)
    elif less_value_max < more_value_min:
        print(f"2:{col}")
        value = (less_value_max + more_value_min) / 2
        train_feats[col] = train_feats[col].mask(train_feats[col] <= value)
        valid_feats[col] = valid_feats[col].mask(valid_feats[col] <= value)
# Replace zeros in selected 'len' / 'count' features by the mean of the
# non-zero training values.
#
# A column qualifies when its name contains 'len' or 'count', contains none
# of the markers '+', '-', '*', '/', 'gap', 'skew', 'std', and zeros make up
# more than 0% but less than 15% of the training rows.
size_markers = ('len', 'count')
excluded_markers = ('+', '-', '*', '/', 'gap', 'skew', 'std')

zerocols = []
for col in valid_feats.columns:
    has_size_marker = any(marker in col for marker in size_markers)
    has_excluded_marker = any(marker in col for marker in excluded_markers)
    if not has_size_marker or has_excluded_marker:
        continue
    zero_rows = int((train_feats[col] == 0).sum())
    if zero_rows > 0 and zero_rows / len(train_feats) < 0.15:
        zerocols.append(col)
print(f"len(zerocols):{len(zerocols)},zerocols:{zerocols}")

for col in zerocols:
    # The fill value comes from the training frame only and is reused for
    # the validation frame.
    nonzero_mean = train_feats.loc[train_feats[col] != 0, col].mean()
    train_feats[col] = train_feats[col].replace(0, nonzero_mean)
    valid_feats[col] = valid_feats[col].replace(0, nonzero_mean)

# Bare expression: shows the first rows when run as a notebook cell and has
# no effect when run as a script.
train_feats.head()
def reduce_mem_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Signed and unsigned integer columns are cast to the smallest integer
    type of the same signedness whose range contains the column's minimum
    and maximum. Float columns are cast by range to float16, float32 or
    float64; a column whose range cannot be established (for example only
    NaN, or containing infinities) ends up as float64. Columns that are not
    plain NumPy integer/float columns (object, category, bool, datetime,
    pandas extension dtypes, ...) are left untouched.

    The memory usage before and after the conversion is printed.

    Args:
        df: DataFrame to shrink. It is modified in place.
        float16_as32: If true, columns whose values fit into float16 are
            stored as float32 instead of float16.

    Returns:
        The same ``df`` object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float16_info = np.finfo(np.float16)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy integer ('i', 'u') and float ('f') columns are
        # resized. Anything else must not fall through to the float branch,
        # where it would be cast to float or fail on the range comparison.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min, c_max = df[col].min(), df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN as min/max, matches no candidate and
            # keeps its dtype.
        elif float16_info.min <= c_min and c_max <= float16_info.max:
            df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
        elif float32_info.min <= c_min and c_max <= float32_info.max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Report 0% instead of dividing by zero when the frame occupied no memory.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))
    return df
# Shrink both feature frames with reduce_mem_usage (training frame first),
# then write each one to its CSV file without the index column.
train_feats, valid_feats = [
    reduce_mem_usage(frame, float16_as32=True)
    for frame in (train_feats, valid_feats)
]
for frame, csv_path in (
    (train_feats, "train_feats.csv"),
    (valid_feats, "valid_feats.csv"),
):
    frame.to_csv(csv_path, index=None)