1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
def str2char():
    """Spread the tokens of ``df['text']`` across positional columns.

    Each value of ``df['text']`` is split on whitespace.  For every token
    position ``k`` that occurs in at least one row, the module-level ``df``
    gains a column ``str2char_k`` holding that row's ``k``-th token, or ``0``
    where the row has fewer than ``k + 1`` tokens.  Columns are created in
    increasing order of ``k``.  An empty frame gains no columns.

    Each column is built as a complete list and assigned in one step.  This
    avoids writing single cells through ``df[col][i] = value`` (chained
    assignment, which pandas does not guarantee to reach ``df``) and avoids
    looking rows up by the label ``i``, so the result does not depend on
    ``df`` having a 0..n-1 index.

    Returns:
        None.  ``df`` is modified in place.
    """
    token_rows = [text.split() for text in df['text']]
    # Number of columns needed = length of the longest token list.
    width = max(map(len, token_rows), default=0)
    for pos in range(width):
        df['str2char_{}'.format(pos)] = [
            tokens[pos] if pos < len(tokens) else 0 for tokens in token_rows
        ]


str2char()
def dup_num():
    """Count the distinct tokens that ``q1`` and ``q2`` share, row by row.

    Both ``df['q1']`` and ``df['q2']`` are split on whitespace; the count for
    a row is the size of the intersection of the two token sets.  The counts
    are stored in ``df['dup_num']`` of the module-level ``df``.

    Rows are paired positionally and the result carries ``df``'s own index,
    so the counts land on the right rows whatever index ``df`` has.

    Returns:
        The ``df['dup_num']`` column after assignment.
    """
    shared_counts = [
        len(set(first.split()) & set(second.split()))
        for first, second in zip(df['q1'], df['q2'])
    ]
    df['dup_num'] = pd.Series(shared_counts, index=df.index, dtype='int64')
    return df['dup_num']


df['dup_num'] = dup_num()
def _shared_tokens(first, second):
    """Return the distinct tokens of *first* that also occur in *second*.

    Both arguments are whitespace-separated strings.  Tokens are returned in
    the order of their first appearance in *first*, each at most once.
    """
    remaining = set(second.split())
    shared = []
    for token in first.split():
        if token in remaining:
            # Drop the token so a repeat in *first* is not emitted twice.
            remaining.discard(token)
            shared.append(token)
    return shared


def dup_chr():
    """Spread the tokens shared by ``q1`` and ``q2`` across positional columns.

    For every row of the module-level ``df`` the distinct tokens present in
    both ``df['q1']`` and ``df['q2']`` are collected.  For every position
    ``k`` that occurs in at least one row, ``df`` gains a column
    ``dup_chr_k`` holding that row's ``k``-th shared token, or ``0`` where
    the row has fewer than ``k + 1`` shared tokens.

    Shared tokens are ordered by first appearance in ``q1``.  Iterating the
    raw set intersection instead would make the column a token lands in
    depend on string hashing, which is not stable between interpreter runs.

    Each column is built as a complete list and assigned in one step, which
    avoids chained ``df[col][i] = value`` writes and label-based row lookups.

    Returns:
        None.  ``df`` is modified in place.
    """
    shared_rows = [
        _shared_tokens(first, second)
        for first, second in zip(df['q1'], df['q2'])
    ]
    # Number of columns needed = largest number of shared tokens in any row.
    width = max(map(len, shared_rows), default=0)
    for pos in range(width):
        df['dup_chr_{}'.format(pos)] = [
            tokens[pos] if pos < len(tokens) else 0 for tokens in shared_rows
        ]


dup_chr()
# --- Per-row numeric summaries of the q1 / q2 / text token columns ----------
# Each value of df['q1'], df['q2'] and df['text'] is split on whitespace and
# every token is converted with int().  The columns added below keep the
# original names and creation order:
#   <col>_len, sub_q1q2, sub_q2q1, max_<col>, min_<col>, max_min_<col>,
#   mean_<col>, std_<col>, freq_<col>
# `mean` and `std` are not defined here; they are resolved from the
# surrounding module, exactly as the original statements did.
# NOTE: max/min/freq raise ValueError for a row that contains no tokens.

_TOKEN_COLUMNS = ('q1', 'q2', 'text')


def _parse_ids(text):
    """Split *text* on whitespace and convert every token to ``int``."""
    return [int(token) for token in text.split()]


def _top_count(ids):
    """Return how many times the most frequent value occurs in *ids*."""
    return max(Counter(ids).values())


# Parse every column exactly once; all statistics below reuse these lists
# instead of re-splitting and re-converting the strings for each feature.
_ids_by_column = {name: df[name].apply(_parse_ids) for name in _TOKEN_COLUMNS}


def _per_row(name, func):
    """Apply *func* to each row's id list of column *name*; return a Series."""
    # The lambda keeps pandas from treating *func* as anything other than a
    # plain element-wise callable.
    return _ids_by_column[name].apply(lambda ids: func(ids))


# Token counts.  These use the same whitespace tokenisation as every other
# feature (the earlier split(' ') counted empty strings produced by repeated,
# leading or trailing spaces, so lengths could disagree with the token lists).
for _name in _TOKEN_COLUMNS:
    df['{}_len'.format(_name)] = _per_row(_name, len)

# Signed length difference, in both directions.
df['sub_q1q2'] = df['q1_len'] - df['q2_len']
df['sub_q2q1'] = df['q2_len'] - df['q1_len']

# Largest and smallest id per row ...
for _label, _func in (('max', max), ('min', min)):
    for _name in _TOKEN_COLUMNS:
        df['{}_{}'.format(_label, _name)] = _per_row(_name, _func)

# ... and the spread between them.
for _name in _TOKEN_COLUMNS:
    df['max_min_{}'.format(_name)] = (
        df['max_{}'.format(_name)] - df['min_{}'.format(_name)]
    )

# Mean, standard deviation, and count of the most frequent id per row.
for _label, _func in (('mean', mean), ('std', std), ('freq', _top_count)):
    for _name in _TOKEN_COLUMNS:
        df['{}_{}'.format(_label, _name)] = _per_row(_name, _func)
|