def _add_similarity_feature(df, column, scorer, question1, question2):
    """Store ``scorer(str(q1), str(q2))`` for every row of *df* in *column*.

    The two question columns are walked in lockstep instead of using
    ``DataFrame.apply(axis=1)``: that avoids building a Series per row and
    also behaves sensibly when *df* has no rows.  Values are passed through
    ``str`` first, exactly as the scorers were called before, so non-string
    cells (e.g. missing values) are compared by their string form.
    """
    df[column] = [
        scorer(str(first), str(second))
        for first, second in zip(df[question1], df[question2])
    ]


def compute_features(train_df, test_df):
    """Add string-similarity feature columns to both frames and score them.

    For each scorer from ``fuzz`` a column named by the matching ``Fields``
    attribute is written **in place** to ``train_df`` and ``test_df``; the
    question columns are looked up via ``FieldsTrain`` / ``FieldsTest``
    respectively.  After each pair of columns is written,
    ``compute_quality(train_df, column)`` is evaluated on the training frame.

    Args:
        train_df: training frame; must contain the ``FieldsTrain.question1``
            and ``FieldsTrain.question2`` columns (plus whatever
            ``compute_quality`` reads -- not visible from here).
        test_df: test frame; must contain the ``FieldsTest.question1`` and
            ``FieldsTest.question2`` columns.

    Returns:
        dict mapping ``"quality_<feature>"`` to the value returned by
        ``compute_quality`` for that feature, in the order the features
        are computed.
    """
    # (result-key suffix, output column, scoring function).  Order matters:
    # it fixes both the column insertion order and the dict key order.
    scorers = (
        ("qratio", Fields.qratio, fuzz.QRatio),
        ("wratio", Fields.wratio, fuzz.WRatio),
        ("partial_ratio", Fields.partial_ratio, fuzz.partial_ratio),
        ("partial_token_set_ratio", Fields.partial_token_set_ratio,
         fuzz.partial_token_set_ratio),
        ("partial_token_sort_ratio", Fields.partial_token_sort_ratio,
         fuzz.partial_token_sort_ratio),
        ("token_set_ratio", Fields.token_set_ratio, fuzz.token_set_ratio),
        ("token_sort_ratio", Fields.token_sort_ratio, fuzz.token_sort_ratio),
    )

    quality = {}
    for name, column, scorer in scorers:
        _add_similarity_feature(
            train_df, column, scorer,
            FieldsTrain.question1, FieldsTrain.question2)
        _add_similarity_feature(
            test_df, column, scorer,
            FieldsTest.question1, FieldsTest.question2)
        # Quality is measured on the training frame only, right after the
        # feature column has been written.
        quality["quality_" + name] = compute_quality(train_df, column)
    return quality