Commit: Triple Selection Training working
pedrojlazevedo committed Mar 30, 2020
1 parent 905d53d commit 9416131
Showing 3 changed files with 65 additions and 29 deletions.
14 changes: 9 additions & 5 deletions defacto/model_nl.py
@@ -1,7 +1,9 @@
from defacto.SolrUtils import SolrUtils
from defacto.core_util import get_topic_terms
# from defacto.SolrUtils import SolrUtils
# from defacto.core_util import get_topic_terms
from defacto.rel_extract import TripleExtraction_ClausIE
from defacto.wikipedia import WikiPediaUtils


# from defacto.wikipedia import WikiPediaUtils


class ModelNL(object):
@@ -22,12 +24,11 @@ def __init__(self, claim, language='en', label=None, fever_id=None):
self.error_on_extract_triples = False
self.error_message = ''
self.__extract_triples()
#if len(self.triples) == 0:
# if len(self.triples) == 0:
# raise Exception('could not extract triples out of the claim!')
except Exception as error:
raise error


def __extract_triples(self):
try:
print('extracting triples...')
@@ -47,6 +48,8 @@ def extract_features(self):
raise


'''
if __name__ == '__main__':
try:
@@ -76,3 +79,4 @@ def extract_features(self):
except Exception as e:
print(e)
'''
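Given the constructor signature above (claim, language='en', label=None, fever_id=None), a minimal smoke test of the class might look like this; the claim text is illustrative, not from the repository:

    from defacto.model_nl import ModelNL

    try:
        m = ModelNL(claim='Albert Einstein was born in Ulm.', label='SUPPORTS', fever_id=1)
        if m.error_on_extract_triples:
            print('extraction failed:', m.error_message)
        else:
            print(m.triples)
    except Exception as e:
        print(e)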
10 changes: 6 additions & 4 deletions defacto/rel_extract.py
@@ -4,7 +4,8 @@
from nltk.parse.stanford import StanfordParser
from nltk.tree import ParentedTree, Tree
from nltk.stem import WordNetLemmatizer
from pycorenlp import *
#from pycorenlp import *


def get_sentences_from_document(text):
return sent_tokenize(text)
@@ -40,7 +41,6 @@ def annotate(self, sentence):
raise error



class TripleExtraction_Rusu(object):
'''
[1] Delia Rusu, Lorand Dali, Blaž Fortuna, Marko Grobelnik, Dunja Mladenić Triplet Extraction from Sentences
@@ -123,11 +123,12 @@ def get_triples(self, sentence):
o = self.find_object(t)
return (s, p, o)


class TripleExtraction_ClausIE(object):

def __init__(self):
try:
self.cl = ClausIE.get_instance(jar_filename='/data/defacto/github/fever/clausie/clausie/clausie.jar')
self.cl = ClausIE.get_instance(jar_filename='/home/guest/git/DeFactoNLP/clausie/clausie.jar')
except Exception as error:
raise error

@@ -137,7 +138,7 @@ def get_triples(self, sentence):
return triples
except:
raise
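With the class above, extraction is a two-liner. A sketch, assuming the pyclausie-style Triple fields (subject, predicate, object) and a clausie.jar that exists at the path given in __init__; the sentence is illustrative:

    extractor = TripleExtraction_ClausIE()
    for t in extractor.get_triples('Albert Einstein was born in Ulm.'):
        print(t.subject, t.predicate, t.object)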

'''
if __name__ == '__main__':
try:
Expand All @@ -164,3 +165,4 @@ def get_triples(self, sentence):
except Exception as e:
print(e)
'''
70 changes: 50 additions & 20 deletions proof_extraction_train.py
@@ -84,6 +84,18 @@ def smith_waterman_distance(seq1, seq2, match=3, mismatch=-1, insertion=-1, dele
------------------------------------------------------------------------------------------------------------------
'''

def encode(x):
# map reserved characters in a wiki page title to the FEVER-style tokens
# used by the wiki-pages-split file names
x = x.replace(" (", "-LRB-")
x = x.replace(") ", "-RRB-")
x = x.replace("/", "-SLH-")
x = x.replace(" [", "-LSB-")
x = x.replace("] ", "-RSB-")
x = x.replace(" ", "_")
x = x.replace(":", "-COLON-")
return x
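For illustration, the mapping above behaves as follows on some assumed titles (not from the repository):

    encode("AC/DC")                # -> "AC-SLH-DC"
    encode("Star Trek: Nemesis")   # -> "Star_Trek-COLON-_Nemesis"
    encode("Savages (2012 film)")  # -> "Savages-LRB-2012_film)"

Note that a trailing ")" or "]" is left untouched, since only ") " and "] " with a following space are replaced.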


def getDocContentFromFile(doc_filename):
try:
content=[]
@@ -355,7 +367,28 @@ def train_model():
y = []
[y.extend(row) for row in [r for r in data_y]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=MODEL_TEST_SIZE, random_state=42)
# marreta ("sledgehammer"): crude undersampling, capping the negatives at the number of positives
_X = []
_y = []
maxk = sum(y)
k = 0
for i in range(0, len(y)):
if y[i] == 1:
_X.append(X[i])
_y.append(y[i])
elif k < maxk and y[i] == 0:  # cap negatives at exactly maxk
_X.append(X[i])
_y.append(y[i])
k += 1
elif sum(_y) < maxk:
continue
else:
break

print(len(_y))  # total examples after balancing
print(sum(_y))  # of which positives
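The block above keeps every positive example and roughly the same number of negatives, so the classifier trains on a balanced set. An equivalent, order-independent sketch using random undersampling (helper name assumed, not part of the commit):

    import random

    def balance_by_undersampling(X, y, seed=42):
        # keep all positives, then sample an equal number of negatives
        pos = [i for i, label in enumerate(y) if label == 1]
        neg = [i for i, label in enumerate(y) if label == 0]
        random.Random(seed).shuffle(neg)
        keep = pos + neg[:len(pos)]
        return [X[i] for i in keep], [y[i] for i in keep]

Sampling negatives at random, rather than taking the first ones encountered, avoids biasing the training set toward whichever claims happen to be processed first.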

X_train, X_test, y_train, y_test = train_test_split(_X, _y, test_size=MODEL_TEST_SIZE, random_state=42)

print('training the classifier...')
clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)
@@ -366,10 +399,11 @@ def train_model():
model2 = clf2.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions2 = model2.predict(X_test)
#P, R, F, S = sklearn.metrics.precision_recall_fscore_support(y_test, predictions)
P, R, F, S = sklearn.metrics.precision_recall_fscore_support(y_test, predictions)
print(classification_report(y_test, predictions, digits=3))
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print('tn, fp, fn, tp', tn, fp, fn, tp)
print(clf.feature_importances_)
print('--------------------------------------------------------')
print(classification_report(y_test, predictions2, digits=3))
tn, fp, fn, tp = confusion_matrix(y_test, predictions2).ravel()
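As a cross-check, precision and recall follow directly from the four counts printed above (a hypothetical helper, not in the commit):

    def precision_recall(tn, fp, fn, tp):
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        return precision, recall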
@@ -405,12 +439,14 @@ def extract_features(defactoNL_full_path_file):
import os
from defacto.model_nl import ModelNL
with open(defactoNL_full_path_file, 'rb') as handle:
print(':: processing ' + defactoNL_full_path_file)
defactoNL = pickle.load(handle)
X = []
y = []
if defactoNL.error_on_extract_triples is True:
print('error on defacto triple extraction: ', defactoNL.error_message)
else:
print('defacto triple extraction OK ')
for proof in defactoNL.proofs:
y.append(1)
X.append(_extract_features(proof, defactoNL.claim, defactoNL.triples))
@@ -419,11 +455,13 @@ def extract_features(defactoNL_full_path_file):
y.append(0)
X.append(_extract_features(non_proof, defactoNL.claim, defactoNL.triples))

assert len(X) == len(y)
return (X, y)
if len(X) == len(y):
return (X, y)
else:
raise Exception('X and y have different size!')

except Exception as e:
print('-- error ', repr(e))
print('-- extract features error ', repr(e))
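A minimal driver for extract_features() over the exported model files might look like this; the glob pattern and .pkl extension are assumptions based on DEFACTO_OUTPUT_FOLDER below, and export_training_data_proof_detection() presumably does something similar:

    import glob

    data_x, data_y = [], []
    for path in glob.glob('defacto/defacto_models/*.pkl'):
        out = extract_features(path)
        if out is not None:  # the function prints and returns None on error
            X, y = out
            data_x.append(X)
            data_y.append(y)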

def export_training_data_proof_detection():
import glob
@@ -462,6 +500,7 @@ def export_training_data_proof_detection():
except Exception as e:
print('error export_training_data_proof_detection()', repr(e))


def export_defacto_models():
try:
job_args = []
@@ -504,7 +543,7 @@ def save_defacto_model(fever_id, claim, label, evidences_train):

# extracting sentences generically
for evidence_meta in evidences_train: # train file
filename = evidence_meta[2]
filename = encode(evidence_meta[2])
id_sentence_supports_refutes = evidence_meta[3]
if filename not in defactoNL.external_documents_names:
defactoNL.external_documents_names.append(filename)
@@ -575,24 +614,15 @@ def save_defacto_model(fever_id, claim, label, evidences_train):
ROOT_PATH = os.getcwd() + "/"
print(' --> root: ', ROOT_PATH)
MAX_TRAINING_DATA = 100000
MODEL_TEST_SIZE = 0.3
MODEL_TEST_SIZE = 0.15

args = sys.argv
print(args)
print(args[1])


if args[1] == 'prod':
PATH_WIKIPAGES = '/data/defacto/github/fever/data/wiki-pages/wiki-pages-split/'
TRAIN_FILE = "/data/defacto/github/fever/data/subsample_train_relevant_docs.jsonl"
DEFACTO_OUTPUT_FOLDER = 'defacto/defacto_models/'
else:
if len(args) == 0:
args = ['dev', '0', '1', '2']
PATH_WIKIPAGES = '/Users/diegoesteves/Github/factchecking/DeFacto/python/defacto/'
TRAIN_FILE = "defacto/small_train.jsonl"
DEFACTO_OUTPUT_FOLDER = 'defacto/defacto_models/'

DEFACTO_OUTPUT_FOLDER = 'defacto/defacto_models/'
args = ['dev', '2']  # hard-wired override of sys.argv; the numeric flags select pipeline stages (cf. the "if '0' in args:" guard below)
PATH_WIKIPAGES = '/home/guest/git/DeFactoNLP/data/wiki-pages-split/'
TRAIN_FILE = "/home/guest/git/DeFactoNLP/data/subsample_train_relevant_docs.jsonl"

if '0' in args:
print('=======================================================================================')
