-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataManager.py
More file actions
146 lines (114 loc) · 5.4 KB
/
dataManager.py
File metadata and controls
146 lines (114 loc) · 5.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import json
from os import remove
from sklearn import model_selection
import string
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import random
class dataManager:
    """Loads, filters, splits, and edits the labelled document corpus in data2.json.

    Label properties: "schutzbedarf", "vertraulichkeit", "personenbezug",
    "geschaeftsgeheimnis" (protection need, confidentiality, personal data,
    trade secret). Each label list is parallel to data["text"].
    """

    # String labels mapped to integer classes. Covers both rating scales
    # found in the data: normal/high/veryhigh and public/internal/confidential.
    _LABEL_TO_INT = {
        "normal": 0,
        "high": 1,
        "veryhigh": 2,
        "public": 0,
        "internal": 1,
        "confidential": 2,
    }

    def __init__(self):
        # Fixed seed so the reserved "second training" subset is identical
        # on every run.
        random.seed(1)
        # Indices of 50 documents held back for a later (second) training phase.
        # NOTE(review): 477 looks like the corpus size at the time of writing —
        # confirm it still matches len(data["text"]).
        self.secondTrainingIndices = random.sample(range(0, 477), 50)

    def getCorpus(self, property):
        """Return {'text': [...], 'label': [...]} for the given label property.

        property: one of "schutzbedarf", "vertraulichkeit", "personenbezug",
        "geschaeftsgeheimnis".
        """
        with open('data2.json', encoding='utf8') as f:
            data = json.load(f)
        return {
            'text': data["text"],
            'label': list(map(self.toInt, data[property])),
        }

    def getCorpusWithoutSecondTraining(self, property):
        """Return the corpus with the 50 reserved second-training documents removed."""
        Corpus = self.getCorpus(property)
        # Pop from the highest index downwards so earlier removals do not
        # shift the positions of indices still to be removed.
        for i in sorted(self.secondTrainingIndices, reverse=True):
            Corpus["text"].pop(i)
            Corpus["label"].pop(i)
        return Corpus

    def deleteFromJson(self, i):
        """Remove document i (its text and all four labels) from data2.json in place.

        Prints the removed text and the new list lengths for manual verification.
        """
        with open('data2.json', encoding='utf8') as f:
            data = json.load(f)
        print(data["text"].pop(i))  # show what was removed
        for key in ("vertraulichkeit", "personenbezug", "schutzbedarf",
                    "geschaeftsgeheimnis"):
            data[key].pop(i)
        for key in ("vertraulichkeit", "text", "personenbezug", "schutzbedarf",
                    "geschaeftsgeheimnis"):
            print(len(data[key]))
        # Explicit utf8 + ensure_ascii=False keeps the German umlauts
        # readable in the file instead of locale-dependent/escaped output.
        with open('data2.json', 'w', encoding='utf8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def toInt(self, value):
        """Map a string label to its integer class (0/1/2).

        Unknown values fall through to int(value), so numeric strings pass
        through and anything else raises ValueError, as before.
        """
        try:
            return self._LABEL_TO_INT[value]
        except KeyError:
            return int(value)

    def produceTestFile(self, index, property="personenbezug"):
        """Write document `index` to TestDokumente/ as a .txt file; return its label."""
        corpus = self.getCorpus(property)
        text = corpus["text"][index]
        label = corpus["label"][index]
        # Explicit utf8: the documents are German and would break on a
        # cp1252/locale default encoding.
        with open("TestDokumente/test{}{}{}.txt".format(index, property, label),
                  "w", encoding="utf8") as file:
            file.write(text)
        return label

    def text_preprocessing(self, text):
        """Lower-case `text`, strip all punctuation, and flatten newlines to spaces."""
        text = text.lower()
        # One C-level pass removes every punctuation character.
        text = text.translate(str.maketrans('', '', string.punctuation))
        return str(text.replace('\n', ' '))

    def getRawData(self, property):
        """Build batched tf.data train/val/test datasets from the raw texts.

        Returns (testDs, valDs, trainDs, Test_X, Test_Y).
        Split: 20% test; 47.5% of the remaining 80% becomes validation
        (~200 validation documents), the rest training.
        """
        batch_size = 32
        Corpus = self.getCorpusWithoutSecondTraining(property)
        # NOTE(review): unlike getRawTestData, texts here are NOT run through
        # text_preprocessing — confirm this asymmetry is intended.
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            Corpus['text'], Corpus['label'], test_size=0.2, random_state=1)
        Train_X, x_val, Train_Y, y_val = model_selection.train_test_split(
            Train_X, Train_Y, test_size=0.475, random_state=1)

        def _to_batched_dataset(x, y):
            # shuffle/repeat/batch/take pipeline kept exactly as before —
            # per the original comment it avoids a shape error downstream.
            ds = tf.data.Dataset.from_tensor_slices((x, y))
            return ds.shuffle(10000).repeat().batch(batch_size).take(len(x))

        testDs = _to_batched_dataset(Test_X, Test_Y)
        valDs = _to_batched_dataset(x_val, y_val)
        trainDs = _to_batched_dataset(Train_X, Train_Y)
        return testDs, valDs, trainDs, Test_X, Test_Y

    def getRawTestData(self, property):
        """Return (Test_X, Test_Y): the 20% test split of preprocessed texts.

        Uses the same random_state as getRawData, so the test split contains
        the same documents.
        """
        Corpus = self.getCorpusWithoutSecondTraining(property)
        texts = list(map(self.text_preprocessing, Corpus['text']))
        Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
            texts, Corpus['label'], test_size=0.2, random_state=1)
        return Test_X, Test_Y
if __name__ == "__main__":
    # Manual entry point: construct a manager for interactive/ad-hoc use
    # (e.g. calling deleteFromJson or produceTestFile from a REPL).
    reader = dataManager()