-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataManager.py
More file actions
146 lines (114 loc) · 5.4 KB
/
dataManager.py
File metadata and controls
146 lines (114 loc) · 5.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import json
from os import remove
from sklearn import model_selection
import string
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import random
class dataManager:
    """Loads, filters, splits, and edits the labelled document corpus in data2.json.

    Label properties: "schutzbedarf", "vertraulichkeit", "personenbezug",
    "geschaeftsgeheimnis" (protection need, confidentiality, personal data,
    trade secret). Each label list is parallel to data["text"].
    """

    # String labels mapped to integer classes. Covers both rating scales
    # found in the data: normal/high/veryhigh and public/internal/confidential.
    _LABEL_TO_INT = {
        "normal": 0,
        "high": 1,
        "veryhigh": 2,
        "public": 0,
        "internal": 1,
        "confidential": 2,
    }

    def __init__(self):
        # Fixed seed so the reserved "second training" subset is identical
        # on every run.
        random.seed(1)
        # Indices of 50 documents held back for a later (second) training phase.
        # NOTE(review): 477 looks like the corpus size at the time of writing —
        # confirm it still matches len(data["text"]).
        self.secondTrainingIndices = random.sample(range(0, 477), 50)

    def getCorpus(self, property):
        """Return {'text': [...], 'label': [...]} for the given label property.

        property: one of "schutzbedarf", "vertraulichkeit", "personenbezug",
        "geschaeftsgeheimnis".
        """
        with open('data2.json', encoding='utf8') as f:
            data = json.load(f)
        return {
            'text': data["text"],
            'label': list(map(self.toInt, data[property])),
        }

    def getCorpusWithoutSecondTraining(self, property):
        """Return the corpus with the 50 reserved second-training documents removed."""
        Corpus = self.getCorpus(property)
        # Pop from the highest index downwards so earlier removals do not
        # shift the positions of indices still to be removed.
        for i in sorted(self.secondTrainingIndices, reverse=True):
            Corpus["text"].pop(i)
            Corpus["label"].pop(i)
        return Corpus

    def deleteFromJson(self, i):
        """Remove document i (its text and all four labels) from data2.json in place.

        Prints the removed text and the new list lengths for manual verification.
        """
        with open('data2.json', encoding='utf8') as f:
            data = json.load(f)
        print(data["text"].pop(i))  # show what was removed
        for key in ("vertraulichkeit", "personenbezug", "schutzbedarf",
                    "geschaeftsgeheimnis"):
            data[key].pop(i)
        for key in ("vertraulichkeit", "text", "personenbezug", "schutzbedarf",
                    "geschaeftsgeheimnis"):
            print(len(data[key]))
        # Explicit utf8 + ensure_ascii=False keeps the German umlauts
        # readable in the file instead of locale-dependent/escaped output.
        with open('data2.json', 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def toInt(self, value):
        """Map a string label to its integer class (0/1/2).

        Unknown values fall through to int(value), so numeric strings pass
        through and anything else raises ValueError, as before.
        """
        try:
            return self._LABEL_TO_INT[value]
        except KeyError:
            return int(value)

    def produceTestFile(self, index, property="personenbezug"):
        """Write document `index` to TestDokumente/ as a .txt file; return its label."""
        corpus = self.getCorpus(property)
        text = corpus["text"][index]
        label = corpus["label"][index]
        # Explicit utf8: the documents are German and would break on a
        # cp1252/locale default encoding.
        with open("TestDokumente/test{}{}{}.txt".format(index, property, label),
                  "w", encoding="utf8") as file:
            file.write(text)
        return label

    def text_preprocessing(self, text):
        """Lower-case `text`, strip all punctuation, and flatten newlines to spaces."""
        text = text.lower()
        # One C-level pass removes every punctuation character.
        text = text.translate(str.maketrans('', '', string.punctuation))
        return str(text.replace('\n', ' '))

    def getRawData(self, property):
        """Build batched tf.data train/val/test datasets from the raw texts.

        Returns (testDs, valDs, trainDs, Test_X, Test_Y).
        Split: 20% test; 47.5% of the remaining 80% becomes validation
        (~200 validation documents), the rest training.
        """
        batch_size = 32
        Corpus = self.getCorpusWithoutSecondTraining(property)
        # NOTE(review): unlike getRawTestData, texts here are NOT run through
        # text_preprocessing — confirm this asymmetry is intended.
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            Corpus['text'], Corpus['label'], test_size=0.2, random_state=1)
        Train_X, x_val, Train_Y, y_val = model_selection.train_test_split(
            Train_X, Train_Y, test_size=0.475, random_state=1)

        def _to_batched_dataset(x, y):
            # shuffle/repeat/batch/take pipeline kept exactly as before —
            # per the original comment it avoids a shape error downstream.
            ds = tf.data.Dataset.from_tensor_slices((x, y))
            return ds.shuffle(10000).repeat().batch(batch_size).take(len(x))

        testDs = _to_batched_dataset(Test_X, Test_Y)
        valDs = _to_batched_dataset(x_val, y_val)
        trainDs = _to_batched_dataset(Train_X, Train_Y)
        return testDs, valDs, trainDs, Test_X, Test_Y

    def getRawTestData(self, property):
        """Return (Test_X, Test_Y): the 20% test split of preprocessed texts.

        Uses the same random_state as getRawData, so the test split contains
        the same documents.
        """
        Corpus = self.getCorpusWithoutSecondTraining(property)
        texts = list(map(self.text_preprocessing, Corpus['text']))
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            texts, Corpus['label'], test_size=0.2, random_state=1)
        return Test_X, Test_Y
if __name__ == "__main__":
    # Manual entry point: construct a manager for interactive/ad-hoc use
    # (e.g. calling deleteFromJson or produceTestFile from a REPL).
    reader = dataManager()