diff --git a/.gitignore b/.gitignore index cce8b2a..9b62224 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,3 @@ data/ word_embeddings/ *.txt *.csv - diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..0e40fe8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ + +# Default ignored files +/workspace.xml \ No newline at end of file diff --git a/.idea/RelEx-relex_CNN.iml b/.idea/RelEx-relex_CNN.iml new file mode 100644 index 0000000..7c9d48f --- /dev/null +++ b/.idea/RelEx-relex_CNN.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..041ee02 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a9411ab --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Command Line Guide b/Command Line Guide new file mode 100644 index 0000000..8cce458 --- /dev/null +++ b/Command Line Guide @@ -0,0 +1,67 @@ +Adding command line parameters: + +If you want to add command line parameters, in __main__.py use the argparser.add_argument() method + +I've already done this but if you were doing this from scratch first you would initialize an argparser object + +Example: + parser = argparse.ArgumentParser(prog="RelEx") + # prog is the name of the program - not really important + + + + +After initializing an ArgumentParser object, call 
add_argument to add desired parameters. + +Example: + # say you wanted to add a flag for crf output layer + # you could do this + parser.add_argument("--crf") + # The string passed here is the flag that users would type when running the program + + >> python relex --crf TRUE + # The value of crf here would be "TRUE" + # Format should be two hyphens followed by name + + + +It's good practice to also add help text for users. Give a short description of what the flag is for. +This will show up to users when they run the program with the -h or --help flag + +Example: + parser.add_argument("--crf", help="Use CRF output layer") + + + + +If you want to have a default value, just add a default parameter + +Example: + parser.add_argument("--crf", help="Use CRF output layer", default="FALSE") + + >> python relex + # The value of crf here would be "FALSE" + + + +Finally, if you want to make a parameter required: + + parser.add_argument("--crf", help="Use CRF output layer", default="FALSE", required=True) + # set required=True. By default all command line parameters are not required; note that a default value is never used once a parameter is required. + + + +Do this for all the parameters you'd like to have and then parse them using ArgumentParser.parse_args() +I have done this in the code already, but if you were to recreate this following the example used thus far, +it would look like: + + args = parser.parse_args() + # parse_args() returns an argparse.Namespace object whose attributes hold the parsed command line parameters + + +Now to access the parameters: args.nameOfParameter +The name of the parameter is the string of text that comes after the two hyphens. +So to access "--crf" which we've been using as an example: + + args.crf + # The value of the crf command line parameter diff --git a/README.md b/README.md index c0d3f95..11fec3e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# RelEx + # RelEx RelEx is a clinical **Rel**ation **Ex**traction Framework to identify relations between two entities. 
The framework is divided into two main components: co-location based (rule based) Relation Extraction and deep learning based (CNN) Relation Extraction. diff --git a/relex/RelEx_NN/__init__.pyc b/relex/RelEx_NN/__init__.pyc new file mode 100644 index 0000000..01f97b7 Binary files /dev/null and b/relex/RelEx_NN/__init__.pyc differ diff --git a/relex/RelEx_NN/__pycache__/__init__.cpython-36.pyc b/relex/RelEx_NN/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..480eaf6 Binary files /dev/null and b/relex/RelEx_NN/__pycache__/__init__.cpython-36.pyc differ diff --git a/relex/RelEx_NN/cnn/__pycache__/__init__.cpython-36.pyc b/relex/RelEx_NN/cnn/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 3c0a044..0000000 Binary files a/relex/RelEx_NN/cnn/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/cnn/__pycache__/segment_cnn.cpython-36.pyc b/relex/RelEx_NN/cnn/__pycache__/segment_cnn.cpython-36.pyc deleted file mode 100644 index 4347f31..0000000 Binary files a/relex/RelEx_NN/cnn/__pycache__/segment_cnn.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/cnn/sentence_cnn.py b/relex/RelEx_NN/cnn/sentence_cnn.py index c0956f6..8c96853 100644 --- a/relex/RelEx_NN/cnn/sentence_cnn.py +++ b/relex/RelEx_NN/cnn/sentence_cnn.py @@ -9,6 +9,7 @@ from sklearn.utils import class_weight from utils import file, normalization import numpy as np +import tensorflow as tf class Sentence_CNN: @@ -62,16 +63,24 @@ def define_model(self, no_classes): model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics) else: - - input_shape = Input(shape=(self.data_model.maxlen,)) - embedding = Embedding(self.data_model.common_words, self.embedding.embedding_dim)(input_shape) - - if self.embedding: - embedding = Embedding(self.data_model.common_words, self.embedding.embedding_dim, - weights=[self.embedding.embedding_matrix], trainable=False)(input_shape) + elmo=False + + if(self.sentences!=None): + 
elmo=True; + + if(elmo==False): + input_shape = Input(shape=(self.data_model.maxlen,)) + embedding = Embedding(self.data_model.common_words, self.embedding.embedding_dim)(input_shape) + + if self.embedding: + embedding = Embedding(self.data_model.common_words, self.embedding.embedding_dim, + weights=[self.embedding.embedding_matrix], trainable=False)(input_shape) + else: + input_shape = Input(shape=(1,),dtype=tf.string) + embedding = ElmoEmbeddingLayer()(input_shape) + conv1 = Conv1D(filters=self.filters, kernel_size=self.filter_conv, activation=self.activation)(embedding) pool1 = MaxPooling1D(pool_size=self.filter_maxPool)(conv1) - conv2 = Conv1D(filters=self.filters, kernel_size=self.filter_conv, activation=self.activation)(pool1) drop = Dropout(self.drop_out)(conv2) @@ -79,12 +88,11 @@ def define_model(self, no_classes): dense1 = Dense(self.filters, activation=self.activation)(flat) outputs = Dense(no_classes, activation=self.output_activation)(dense1) - model = Model(inputs=input_shape, outputs=outputs) + model = Model(inputs=[input_shape], outputs=outputs) model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics) # summarize print(model.summary()) - return model def fit_Model(self, model, x_train, y_train, validation=None): @@ -100,8 +108,10 @@ def fit_Model(self, model, x_train, y_train, validation=None): # weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train) # history = model.fit(x_train, y_train, epochs=self.epochs, batch_size=self.batch_size, validation_data=validation, class_weight=weights) + history = model.fit(x_train, y_train, epochs=self.epochs, - batch_size=self.batch_size, validation_data=validation) + batch_size=self.batch_size, validation_data=validation) + # loss = history.history['loss'] # acc = history.history['acc'] @@ -216,7 +226,26 @@ def cross_validate(self, num_folds=5): """ X_data = self.data_model.train Y_data = self.data_model.train_label + binary_Y = 
self.data_model.binarize_labels(Y_data, True) + labelfile = open("binarylabel","w") + for thing in binary_Y: + for t in thing: + labelfile.write(str(t)) + labelfile.write(" ") + labelfile.write("\n") + labelfile.close() + exit() + cv_model = self.define_model(len(self.data_model.encoder.classes_)) + for i in range(0,len(self.sentences)): + if(len(self.sentences[i].split())>100): + self.sentences[i]=" ".join(self.sentences[i].split()[:100]) + sentences=np.array(self.sentences, dtype=object)[:, np.newaxis] + print(sentences.shape) + print(X_data.shape) + print(binary_Y.shape) + cv_model.fit(sentences, binary_Y, epochs=self.epochs, batch_size=self.batch_size) + """ if num_folds <= 1: raise ValueError("Number of folds for cross validation must be greater than 1") assert X_data is not None and Y_data is not None, \ @@ -224,7 +253,6 @@ def cross_validate(self, num_folds=5): skf = StratifiedKFold(n_splits=num_folds, shuffle=True) skf.get_n_splits(X_data, Y_data) - evaluation_statistics = {} fold = 1 @@ -292,3 +320,5 @@ def cross_validate(self, num_folds=5): #print results using MedaCy evaluation (similar to the sklearn evaluation above) print("---------------------medacy Results --------------------------------") evaluate.cv_evaluation(labels, evaluation_statistics) + + """ diff --git a/relex/RelEx_NN/embeddings/__pycache__/__init__.cpython-36.pyc b/relex/RelEx_NN/embeddings/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 380b888..0000000 Binary files a/relex/RelEx_NN/embeddings/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/embeddings/__pycache__/elmo_embeddings.cpython-36.pyc b/relex/RelEx_NN/embeddings/__pycache__/elmo_embeddings.cpython-36.pyc new file mode 100644 index 0000000..e41a050 Binary files /dev/null and b/relex/RelEx_NN/embeddings/__pycache__/elmo_embeddings.cpython-36.pyc differ diff --git a/relex/RelEx_NN/embeddings/__pycache__/embeddings.cpython-36.pyc 
b/relex/RelEx_NN/embeddings/__pycache__/embeddings.cpython-36.pyc deleted file mode 100644 index f58eab0..0000000 Binary files a/relex/RelEx_NN/embeddings/__pycache__/embeddings.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/embeddings/elmo_embeddings.py b/relex/RelEx_NN/embeddings/elmo_embeddings.py new file mode 100644 index 0000000..f97c833 --- /dev/null +++ b/relex/RelEx_NN/embeddings/elmo_embeddings.py @@ -0,0 +1,27 @@ +import tensorflow as tf +import tensorflow_hub as hub +from keras import backend as K +from keras.engine import Layer + +class ElmoEmbeddingLayer(Layer): + def __init__(self, **kwargs): + self.dimensions = 1024 + self.trainable=False + super(ElmoEmbeddingLayer, self).__init__(**kwargs) + + def build(self, input_shape): + self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable, name="{}_module".format(self.name)) + self.trainable_weights += tf.trainable_variables(scope="^{}_module/.*".format(self.name)) + super(ElmoEmbeddingLayer, self).build(input_shape) + + def call(self, x, mask=None): + result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),as_dict=True,signature='default',)['default'] + result=tf.expand_dims(result,axis=0) + print(result.shape) + return result + + # def compute_mask(self, inputs, mask=None): + # return K.not_equal(inputs, '--PAD--') + + def compute_output_shape(self, input_shape): + return (input_shape[0],100, self.dimensions) diff --git a/relex/RelEx_NN/model/__init__.pyc b/relex/RelEx_NN/model/__init__.pyc new file mode 100644 index 0000000..f30eb04 Binary files /dev/null and b/relex/RelEx_NN/model/__init__.pyc differ diff --git a/relex/RelEx_NN/model/__pycache__/__init__.cpython-36.pyc b/relex/RelEx_NN/model/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 512645e..0000000 Binary files a/relex/RelEx_NN/model/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/model/__pycache__/evalaute.cpython-36.pyc 
b/relex/RelEx_NN/model/__pycache__/evalaute.cpython-36.pyc new file mode 100644 index 0000000..f7d672d Binary files /dev/null and b/relex/RelEx_NN/model/__pycache__/evalaute.cpython-36.pyc differ diff --git a/relex/RelEx_NN/model/__pycache__/evaluate.cpython-36.pyc b/relex/RelEx_NN/model/__pycache__/evaluate.cpython-36.pyc deleted file mode 100644 index 3463197..0000000 Binary files a/relex/RelEx_NN/model/__pycache__/evaluate.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/model/__pycache__/model.cpython-36.pyc b/relex/RelEx_NN/model/__pycache__/model.cpython-36.pyc deleted file mode 100644 index ae55119..0000000 Binary files a/relex/RelEx_NN/model/__pycache__/model.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/model/model.py b/relex/RelEx_NN/model/model.py index bdb3119..cc7a46f 100644 --- a/relex/RelEx_NN/model/model.py +++ b/relex/RelEx_NN/model/model.py @@ -28,6 +28,7 @@ def create_validation_data(train_data, train_label, num_data=1000): return x_train, x_val, y_train, y_val + def reduce_duplicate_data(train_data, train_labels): """ Reads the data into one dataframe. Removes the duplicated data and merges the respective labels. 
Also drops the @@ -87,7 +88,6 @@ def __init__(self, data_object, data_object_test=None, segment=False, test=False :param one_hot: Flag to be set to create one-hot vectors (default-False) :param common_words: Number of words to consider as features (default = 10000) :param maxlen: maximum length of the vector (default = 100) - """ self.de_sample = de_sample self.write_Predictions = write_Predictions @@ -164,17 +164,17 @@ def __init__(self, data_object, data_object_test=None, segment=False, test=False self.train_track = train_track if self.test: self.train, self.x_test, self.word_index = self.vectorize_words(train_data, test_data) - self.train_onehot, self.x_test_onehot, self.token_index = self.one_hot_encoding(train_data, test_data) + #self.train_onehot, self.x_test_onehot, self.token_index = self.one_hot_encoding(train_data, test_data) self.y_test = test_labels self.test_track = test_track else: - self.train_onehot, self.token_index = self.one_hot_encoding(train_data, test_data) + #self.train_onehot, self.token_index = self.one_hot_encoding(train_data, test_data) self.train, self.word_index = self.vectorize_words(train_data, test_data) # divides train data into partial train and validation data self.x_train, self.x_val, self.y_train, self.y_val = create_validation_data(self.train, self.train_label) - self.x_train_onehot, self.x_val_onehot, self.y_train, self.y_val = create_validation_data(self.train_onehot, - self.train_label) + # self.x_train_onehot, self.x_val_onehot, self.y_train, self.y_val = create_validation_data(self.train_onehot, + # self.train_label) if self.segment: train_preceding = data_object['seg_preceding'] train_middle = data_object['seg_middle'] diff --git a/relex/RelEx_NN/model/model.pyc b/relex/RelEx_NN/model/model.pyc new file mode 100644 index 0000000..95b2bc0 Binary files /dev/null and b/relex/RelEx_NN/model/model.pyc differ diff --git a/relex/RelEx_NN/model/temp.py b/relex/RelEx_NN/model/temp.py new file mode 100644 index 0000000..a2fafd6 --- 
/dev/null +++ b/relex/RelEx_NN/model/temp.py @@ -0,0 +1,106 @@ +import os +import pandas as pd +from keras.preprocessing.text import Tokenizer +from keras.preprocessing.sequence import pad_sequences +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.model_selection import train_test_split + +from keras.models import Sequential +from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D +from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint +from keras.losses import binary_crossentropy +from keras.optimizers import Adam + +def read_from_file(file): + """ + Reads external files and insert the content to a list. It also removes whitespace + characters like `\n` at the end of each line + + :param file: name of the input file. + :return : content of the file in list format + """ + if not os.path.isfile(file): + raise FileNotFoundError("Not a valid file path") + + with open(file) as f: + content = f.readlines() + content = [x.strip() for x in content] + + return content + + +def get_features(text_series): + """ + transforms text data to feature_vectors that can be used in the ml model. + tokenizer must be available. 
+ """ + sequences = tokenizer.texts_to_sequences(text_series) + return pad_sequences(sequences, maxlen=maxlen) + + +def prediction_to_label(prediction): + tag_prob = [(labels[i], prob) for i, prob in enumerate(prediction.tolist())] + return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True)) + +train_data = read_from_file("../../../data/segments/sentence_train") +train_labels = read_from_file("../../../data/segments/labels_train") +df_data = pd.DataFrame(train_data) +df_data.columns = ['sentence'] +df_label = pd.DataFrame(train_labels) +df_label.columns = ['label'] + +df_data.reset_index(drop=True, inplace=True) +df_label.reset_index(drop=True, inplace=True) +df_new= pd.concat((df_data, df_label),axis=1) +# print(df_new) +# df_data["label"]=df_label.label +df_new.drop_duplicates(inplace=True) +# +df= df_new.groupby('sentence').agg({'label':lambda x: ','.join(x)}) +print(df) +df.reset_index( inplace=True) +print(df) + +df['label']= df['label'].str.split(",") +print(df.columns) +df.columns = ['sentence', 'label'] +print(df) +multilabel_binarizer = MultiLabelBinarizer() +multilabel_binarizer.fit(df.label) +labels = multilabel_binarizer.classes_ +print(len(labels)) + +maxlen = 180 +max_words = 5000 +tokenizer = Tokenizer(num_words=max_words, lower=True) +tokenizer.fit_on_texts(df.sentence) +# print(df.label) +x = get_features(df.sentence) +y = multilabel_binarizer.transform(df.label) +print(y) +x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000) + +filter_length = 300 + +# model = Sequential() +# model.add(Embedding(max_words, 20, input_length=maxlen)) +# model.add(Dropout(0.1)) +# model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1)) +# model.add(GlobalMaxPool1D()) +# model.add(Dense(num_classes)) +# model.add(Activation('sigmoid')) +# +# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy']) +# model.summary() +# +# callbacks = [ +# ReduceLROnPlateau(), 
+# EarlyStopping(patience=4), +# ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True) +# ] +# +# history = model.fit(x_train, y_train, +# epochs=20, +# batch_size=32, +# validation_split=0.1, +# callbacks=callbacks) \ No newline at end of file diff --git a/relex/RelEx_NN/nn/__pycache__/__init__.cpython-36.pyc b/relex/RelEx_NN/nn/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index 94485f2..0000000 Binary files a/relex/RelEx_NN/nn/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/RelEx_NN/nn/__pycache__/simple_NN.cpython-36.pyc b/relex/RelEx_NN/nn/__pycache__/simple_NN.cpython-36.pyc deleted file mode 100644 index 327aa6c..0000000 Binary files a/relex/RelEx_NN/nn/__pycache__/simple_NN.cpython-36.pyc and /dev/null differ diff --git a/relex/__main__.py b/relex/__main__.py new file mode 100644 index 0000000..324b345 --- /dev/null +++ b/relex/__main__.py @@ -0,0 +1,67 @@ +from RelEx_NN.model import Model +from RelEx_NN.embeddings import Embeddings +from RelEx_NN.cnn import Segment_CNN +from RelEx_NN.cnn import Sentence_CNN +import numpy +import argparse +import tensorflow_hub as hu +import tensorflow as tf + +# embedding_path = "home/neha/Documents/mimic3_d200.txt" model = Model(True, False) embedding=Embeddings( +# embedding_path, model) seg_cnn = Segment_CNN(model, embedding) seg_cnn.cross_validate(preceding, +# middle, succeeding, concept1, concept2, train_label) +from torch import hub + + + +parser = argparse.ArgumentParser(prog="RelEx") +# parser.add_argument("-d", "--data",help="provide directory location for data") +parser.add_argument("--sentrain", help="provide path to sentence training data",required=True) +parser.add_argument("--labeltrain", help="provide path to label training data",required=True) +parser.add_argument("--sentest", help="provide path to sentence testing data (if applicable)") +parser.add_argument("--labeltest", help="provide path to label testing data (if applicable)") 
+parser.add_argument("--embedding", help="provide path to embedding file", required=True) # might want to make this more specific +parser.add_argument("--segment", help="activate segment-cnn", default="TRUE") +parser.add_argument("--test", help="validate model on test dataset", default="FALSE") +parser.add_argument("--multilabel", help="insert help text here", default="TRUE") +parser.add_argument("--onehot", help="create one-hot vectors", default="FALSE") +parser.add_argument("--commonwords", help="number of words to consider as features", default="10000") +parser.add_argument("--maxlen", help="maximum length of vector", default="100") +parser.add_argument("--elmo", help="use elmo embeddings", default="FALSE") +args = parser.parse_args() +segment = args.segment == "TRUE" +test = args.test == "TRUE" +multilabel = args.multilabel == "TRUE" +onehot = args.onehot == "TRUE" +elmo = args.elmo == "TRUE" +model=None + +model = Model(sentrain=args.sentrain, sentest=args.sentest, labeltest=args.labeltest, labeltrain=args.labeltrain, + segment=segment, test=test, multilabel=multilabel, one_hot=onehot, common_words=args.commonwords, + maxlen=args.maxlen) + +train_preceding = open("/home/neha/Desktop/P_Tr/preceding_seg").read() +train_middle = open("/home/neha/Desktop/P_Tr/middle_seg").read() +train_succeeding = open("/home/neha/Desktop/P_Tr/succeeding_seg").read() +train_concept1 = open("/home/neha/Desktop/P_Tr/concept1_seg").read() +train_concept2 = open("/home/neha/Desktop/P_Tr/concept2_seg").read() +train_sent = open("/home/neha/Desktop/P_Tr/sentence_train").read() +sents = train_sent.split("\n") +print(len(sents)) + + +#print(args.sentrain) + + + + +embedding_path = args.embedding +# +embedding=Embeddings(embedding_path, model) +if elmo: + seg_cnn = Sentence_CNN(model, elmo=args.sentrain, cross_validation=True, embedding=embedding) +else: + seg_cnn = Sentence_CNN(model, cross_validation=True, embedding=embedding) + + + diff --git a/relex/__pycache__/__init__.cpython-36.pyc 
b/relex/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..8a8943c Binary files /dev/null and b/relex/__pycache__/__init__.cpython-36.pyc differ diff --git a/relex/__pycache__/__main__.cpython-36.pyc b/relex/__pycache__/__main__.cpython-36.pyc new file mode 100644 index 0000000..3d1de8c Binary files /dev/null and b/relex/__pycache__/__main__.cpython-36.pyc differ diff --git a/relex/charlie.py b/relex/charlie.py new file mode 100644 index 0000000..d975222 --- /dev/null +++ b/relex/charlie.py @@ -0,0 +1,13 @@ +from RelEx_NN.model import Model +from segment import Set_Connection +from RelEx_NN.cnn import Sentence_CNN +from RelEx_NN.embeddings import Embeddings + +data = Set_Connection(CSV=True, sentence_only = True, sentences='../data/segments/sentence_train', labels='../data/segments/labels_train').data_object +#data = Set_Connection(CSV=True, sentences='../data/segments/sentence_train', labels='../data/segments/labels_train', preceding_segs='../data/segments/preceding_seg', concept1_segs='../data/segments/concept1_seg',middle_segs='../data/segments/middle_seg',concept2_segs='../data/segments/concept2_seg', succeeding_segs='../data/segments/succeeding_seg' ).data_object +sentences = data["sentence"] +model = Model(data, segment=False, test=False, multilabel=False, one_hot=False) +embedding = Embeddings("../word_embeddings/mimic3_d200.txt", model, embedding_dim=200) +sent_cnn = Sentence_CNN(model, embedding, cross_validation=True,sentences=sentences,filters=300, drop_out=0.1, filter_conv=3, optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy']) + + diff --git a/relex/data/__pycache__/__init__.cpython-36.pyc b/relex/data/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index eae58c5..0000000 Binary files a/relex/data/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/data/__pycache__/annotation.cpython-36.pyc b/relex/data/__pycache__/annotation.cpython-36.pyc deleted file mode 100644 
index 4d9d310..0000000 Binary files a/relex/data/__pycache__/annotation.cpython-36.pyc and /dev/null differ diff --git a/relex/data/__pycache__/dataset.cpython-36.pyc b/relex/data/__pycache__/dataset.cpython-36.pyc deleted file mode 100644 index 1a6cb44..0000000 Binary files a/relex/data/__pycache__/dataset.cpython-36.pyc and /dev/null differ diff --git a/relex/data/dataset.py b/relex/data/dataset.py index 1b34f55..53ed2f3 100644 --- a/relex/data/dataset.py +++ b/relex/data/dataset.py @@ -46,4 +46,4 @@ def __next__(self): except IndexError: raise StopIteration() self.index += 1 - return word \ No newline at end of file + return word diff --git a/relex/exec.py b/relex/exec.py new file mode 100644 index 0000000..16ef2c6 --- /dev/null +++ b/relex/exec.py @@ -0,0 +1,17 @@ +from segment import Segmentation +sample_train = '/home/mahendrand/NLP/RelEx/data/sample_train' +# sample_test = Dataset('/home/samantha/Desktop/Research/Data/i2b2/sample_test') +# training_dataset = Dataset('/home/samantha/Desktop/Research/Data/i2b2/train_data') +# testing_dataset = Dataset('/home/samantha/Desktop/Research/Data/i2b2/test_data') + +seg_sampleTrain = Segmentation(sample_train) +# sample_sentTrain = seg_sampleTrain.segments['sentence'] +# sample_labelTrain = seg_sampleTrain.segments['label'] + +# seg_train = Segmentation(training_dataset) +# print(len(seg_train.segments['sentence'])) +# print(len(seg_train.segments['label'])) + +# seg_test = Segmentation(testing_dataset) +# print(seg_test.segments['sentence']) +# print(seg_test.segments['label']) \ No newline at end of file diff --git a/relex/experiments.py b/relex/experiments.py index b733ef0..d1eba64 100644 --- a/relex/experiments.py +++ b/relex/experiments.py @@ -5,7 +5,7 @@ from RelEx_NN.nn import Simple_NN from segment import Set_Connection import sys - +import os if sys.argv[1] == 'mimic': if int(sys.argv[2]) == 200: embedding_path = "../word_embeddings/mimic3_d200.txt" @@ -47,4 +47,4 @@ sent_cnn = Sentence_CNN(model, embedding, 
True, filters=300, drop_out=0.1, filter_conv=3, optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy']) # sent_cnn = Simple_NN(model, embedding, True) -# seg_cnn.cross_validate(model.preceding, model.middle, model.succeeding, model.concept1, model.concept2, model.train_label) \ No newline at end of file +# seg_cnn.cross_validate(model.preceding, model.middle, model.succeeding, model.concept1, model.concept2, model.train_label) diff --git a/relex/run.sh b/relex/run.sh index fafea66..d11682f 100755 --- a/relex/run.sh +++ b/relex/run.sh @@ -27,4 +27,4 @@ python experiments.py chem 200 segment #echo "glove 200 multi" #python experiments.py glove 200 multi #echo "glove 300 multi" -#python experiments.py glove 300 multi \ No newline at end of file +#python experiments.py glove 300 multi diff --git a/relex/run_test.sh b/relex/run_test.sh index 83dfcfa..ba46aa2 100755 --- a/relex/run_test.sh +++ b/relex/run_test.sh @@ -21,4 +21,4 @@ python demo_test.py mimic 300 multi echo "glove 200 multi" python demo_test.py glove 200 multi echo "glove 300 multi" -python demo_test.py glove 300 multi \ No newline at end of file +python demo_test.py glove 300 multi diff --git a/relex/segment/__init__.py b/relex/segment/__init__.py index 6b4d5ad..7da82f3 100644 --- a/relex/segment/__init__.py +++ b/relex/segment/__init__.py @@ -1,2 +1,2 @@ from .segmentation import Segmentation -from .set_connection import Set_Connection \ No newline at end of file +from .set_connection import Set_Connection diff --git a/relex/segment/__pycache__/segmentation.cpython-36.pyc b/relex/segment/__pycache__/segmentation.cpython-36.pyc deleted file mode 100644 index 84d6638..0000000 Binary files a/relex/segment/__pycache__/segmentation.cpython-36.pyc and /dev/null differ diff --git a/relex/segment/__pycache__/set_connection.cpython-36.pyc b/relex/segment/__pycache__/set_connection.cpython-36.pyc deleted file mode 100644 index 5a0f73d..0000000 Binary files 
a/relex/segment/__pycache__/set_connection.cpython-36.pyc and /dev/null differ diff --git a/relex/utils/__pycache__/__init__.cpython-36.pyc b/relex/utils/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index a612344..0000000 Binary files a/relex/utils/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/relex/utils/__pycache__/file.cpython-36.pyc b/relex/utils/__pycache__/file.cpython-36.pyc deleted file mode 100644 index 2ddddb9..0000000 Binary files a/relex/utils/__pycache__/file.cpython-36.pyc and /dev/null differ