- import numpy as np
- import pandas as pd
- import random
- import matplotlib.pyplot as plt
import multiprocessing
import pickle
- from sklearn.datasets import load_iris
- from matplotlib import pyplot
+ import random

+ import numpy as np
+ import pandas as pd
+ from deap import algorithms
from deap import base
from deap import creator
from deap import tools
- from deap import algorithms
-
- import rpy2.robjects as robjects
- from meta_features.ecol import Ecol
+ from rpy2 import robjects

- import setup.setup_framework as setup
- from instances_generator.generator import InstancesGenerator
import extractor
import preprocess
+ import setup.setup_framework as setup
+ from meta_features.ecol import Ecol
+ from instances_generator.generator import InstancesGenerator

- # TODO: Implement Setup in a minimal main()
- options = setup.get_options()
-
- cont = 0
- bobj = 0.4
- P = [12]
- SCALES = [1]
- tread = ""
- select_new_dataset = "N"
- NGEN = 1000
- # NGEN = options['NGEN']
- CXPB = 0.7
- MUTPB = 0.2
- INDPB = 0.05
- POP = 100
-
- # TODO: Implement Generator of Instances in a minimal main()
- gen_instances = InstancesGenerator(options)
- df = gen_instances.generate(options['maker'][0])
-
- filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
-     str(NGEN)
-
- metrics = options['measures']
-
- # TODO: Implement fitness global measures in a minimal main()
- global_measures = []
- if (options['filepath'] != ""):
-     base_df = pd.read_csv(options['filepath'])
-     target = options['label_name']
-
-     # Copying column names
-     # df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
-
-     # Extraction of data complexity values
-     global_measures = tuple(extractor.complexity(base_df, target, metrics))
- else:
-     for metric in metrics:
-         global_measures.append(options[metric])
-     global_measures = tuple(global_measures)
-
- filename += '-' + '-'.join(metrics)
- N_ATTRIBUTES = int(options['samples'])  # misspelled variable name
- print(metrics, len(metrics))
- print(global_measures)
- NOBJ = len(metrics)
-
- dic = {}
-
- # reference points
- ref_points = [tools.uniform_reference_points(
-     NOBJ, p, s) for p, s in zip(P, SCALES)]
- ref_points = np.concatenate(ref_points)
- _, uniques = np.unique(ref_points, axis=0, return_index=True)
- ref_points = ref_points[uniques]
+ def generate_instances(samples, attributes, classes, maker: tuple[int, str]
+                        ) -> pd.DataFrame:
+     """
+     Function responsible for the generation of instances; relies heavily
+     on an InstancesGenerator object.
+
+     Parameters
+     ----------
+     samples : Number of instances to be generated.
+     attributes : Number of attributes/features to be generated.
+     classes : Number of classes an instance can be assigned to.
+     maker : The type of maker that will generate the set of instances.
+
+     Returns
+     -------
+     pandas.DataFrame
+     """
+     gen_instances = InstancesGenerator(samples, attributes,
+                                        classes=classes,
+                                        maker_option=maker[1])
+     return gen_instances.generate(maker[0])
+
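A minimal usage sketch; the maker tuple (1, 'blobs') below is a hypothetical placeholder, since the valid values depend on InstancesGenerator:

df = generate_instances(1000, 2, 2, (1, 'blobs'))  # 1000 samples, 2 features, 2 classes
print(df.shape)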
+ def complexity_extraction(measures: list[str], *,
+                           dataframe_label: tuple[pd.DataFrame, str] = None,
+                           complexity_values: dict) -> tuple[np.float64]:
+     """
+     Function that extracts complexity values from a data set; relies
+     heavily on the extractor module.
+
+     Parameters
+     ----------
+     measures : A list of complexity measures to extract from the data set.
+     dataframe_label : The DataFrame itself and its label column.
+     complexity_values : Dictionary of complexity values (TODO: simplify!)
+
+     Returns
+     -------
+     tuple[complexity_values]
+     """
+     if dataframe_label is not None:
+         # Copying column names
+         # df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
+
+         # Extraction of data complexity values
+         return tuple(extractor.complexity(dataframe_label[0],
+                                           dataframe_label[1],
+                                           measures))
+     return tuple(complexity_values[cm] for cm in measures)
+
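Both call paths, sketched with hypothetical measure names 'F1' and 'N1':

# Values already known: read the optimization targets straight from the dict.
targets = complexity_extraction(['F1', 'N1'],
                                complexity_values={'F1': 0.3, 'N1': 0.1})
# Values unknown: extract them from a labeled DataFrame instead.
targets = complexity_extraction(['F1', 'N1'],
                                dataframe_label=(base_df, 'label'),
                                complexity_values={})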
68
+ # TODO: Build a clever architecture for the filename
69
+ def build_filename (filename : str = '' , * , ngen : int , metrics : list ) -> str :
70
+ """
71
+ Function that builds a filename based on the number of generations and
72
+ metrics used to optimize.
73
+
74
+ Parameters
75
+ ----------
76
+ filename : Name or Prefix of the File that contains the result of the
77
+ optimization process.
78
+ ngen : Number of generations of the current run of optimization.
79
+ metrics : A list of metrics used to optimize.
80
+ """
81
+ filename = filename if filename != "" else "NGEN=" + \
82
+ str (ngen )
83
+ filename += '-' + '-' .join (metrics )
84
+ return filename
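For example, with placeholder metric names:

build_filename('run42', ngen=1000, metrics=['F1', 'N1'])  # -> 'run42-F1-N1'
build_filename('', ngen=1000, metrics=['F1', 'N1'])       # -> 'NGEN=1000-F1-N1'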

def my_evaluate(individual):
    vetor = []
@@ -100,25 +107,70 @@ def print_evaluate(individual):

    return tuple(vetor)

-
- creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*NOBJ)
- creator.create("Individual", list, fitness=creator.FitnessMin)
-
- RANDINT_LOW = 0
- RANDINT_UP = options['classes'] - 1
-
- toolbox = base.Toolbox()
- toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
- toolbox.register("individual", tools.initRepeat,
-                  creator.Individual, toolbox.attr_int, N_ATTRIBUTES)
- toolbox.register("population", tools.initRepeat, list, toolbox.individual)
- toolbox.register("evaluate", my_evaluate)
- toolbox.register("mate", tools.cxTwoPoint)
- toolbox.register("mutate", tools.mutShuffleIndexes, indpb=INDPB)
- toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
-
-
- def main(seed=None):
+ def setup_engine(options):
+     """
+     Function that sets up a deap.base.Toolbox for the search-engine
+     process.
+
+     Parameters
+     ----------
+     options : Dictionary of setup parameters that determine how the
+         search engine will find the solutions.
+
+     Returns
+     -------
+     deap.base.Toolbox
+     """
+     samples = int(options['samples'])
+     n_objectives = len(options['measures'])
+
+     # reference points
+     ref_points = [tools.uniform_reference_points(
+         n_objectives, p, s) for p, s in zip(options['P'], options['SCALES'])]
+     ref_points = np.concatenate(ref_points)
+     _, uniques = np.unique(ref_points, axis=0, return_index=True)
+     ref_points = ref_points[uniques]
+
+     creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*n_objectives)
+     creator.create("Individual", list, fitness=creator.FitnessMin)
+
+     randint_down = 0
+     randint_up = options['classes'] - 1
+
+     toolbox = base.Toolbox()
+     toolbox.register("attr_int", random.randint, randint_down, randint_up)
+     toolbox.register("individual", tools.initRepeat,
+                      creator.Individual, toolbox.attr_int, samples)
+     toolbox.register("population", tools.initRepeat, list, toolbox.individual)
+     toolbox.register("evaluate", my_evaluate)
+     toolbox.register("mate", tools.cxTwoPoint)
+     indpb = options['INDPB']
+     toolbox.register("mutate", tools.mutShuffleIndexes, indpb=indpb)
+     toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
+
+     return toolbox
+
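For intuition on the reference points: DEAP's uniform_reference_points builds a Das-Dennis lattice on the unit simplex, so each (p, scale) pair contributes C(n_objectives + p - 1, p) points before deduplication. A quick sketch, assuming three objectives and the P=[12], SCALES=[1] values the old module-level code used:

import math
from deap import tools

n_objectives, p = 3, 12
ref = tools.uniform_reference_points(n_objectives, p, 1)
# Das-Dennis lattice: C(n_objectives + p - 1, p) points in total.
assert len(ref) == math.comb(n_objectives + p - 1, p)  # 91 points here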
+ def results(options: dict, toolbox: base.Toolbox):
+     """
+     Function that drives the search-engine process by running an
+     evolutionary algorithm to find the best results.
+
+     Parameters
+     ----------
+     options : Dictionary of setup parameters that determine how the
+         search engine will find the solutions.
+     toolbox : A Toolbox for evolution that contains evolutionary operators.
+
+     Returns
+     -------
+     deap.base.toolbox.population : A population of the best individuals
+         from the search-engine process.
+     deap.tools.logbook : A logbook that contains evolutionary and
+         statistical information about the search process.
+     """
+     pop = options['POP']
+     cxpb = options['CXPB']
+     mutpb = options['MUTPB']
+     ngen = options['NGEN']
    random.seed(64)
    pool = multiprocessing.Pool(processes=12)
    toolbox.register("map", pool.map)
@@ -132,58 +184,84 @@ def main(seed=None):
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "std", "min", "avg", "max"

-     pop = toolbox.population(POP)
+     tool_pop = toolbox.population(pop)

    # Evaluate the individuals with an invalid fitness
-     invalid_ind = [ind for ind in pop if not ind.fitness.valid]
+     invalid_ind = [ind for ind in tool_pop if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
    # Compile statistics about the population
-     record = stats.compile(pop)
+     record = stats.compile(tool_pop)

    logbook.record(gen=0, evals=len(invalid_ind), **record)
    print(logbook.stream)
    # Begin the generational process
-     for gen in range(1, NGEN):
-         offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)
+     for gen in range(1, ngen):
+         offspring = algorithms.varAnd(tool_pop, toolbox, cxpb, mutpb)
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        # Select the next generation population from parents and offspring
-         pop = toolbox.select(pop + offspring, POP)
+         tool_pop = toolbox.select(tool_pop + offspring, pop)

        # Compile statistics about the new population
-         record = stats.compile(pop)
+         record = stats.compile(tool_pop)
        logbook.record(gen=gen, evals=len(invalid_ind), **record)
        print(logbook.stream)
-     return pop, logbook
+     return tool_pop, logbook
+
+ def main():
+     options = setup.get_options()
+
+     if options['filepath'] != '':
+         base_df = pd.read_csv(options['filepath'])
+
+     global dataFrame
+     dataFrame = generate_instances(options['samples'], options['attributes'],
+                                    options['classes'], options['maker'])
+
+     complexity_values = {}
+     global metrics
+     metrics = options['measures']
+     for measure in metrics:
+         complexity_values[measure] = options[measure]
+     global global_measures
+     global_measures = complexity_extraction(metrics,
+                                             dataframe_label=(
+                                                 base_df, options['label_name']
+                                             ),
+                                             complexity_values=complexity_values
+                                             )
+
+     filename = build_filename(options['filename'],
+                               ngen=options['NGEN'],
+                               metrics=metrics)

-
- if __name__ == '__main__':
-     cont1 = 0
-     cont0 = 0
-     # dataFrame = pd.read_csv(str(N_ATTRIBUTES) + '.csv')
-     # dataFrame = dataFrame.drop('c0', axis=1)
-     dataFrame = df

    # This Ecol object should be called according to the variable dataFrame.
-     # If dataFrame is renamed, then ecol_dataFrame should be renamed
+     # If dataFrame is renamed, then ecol_dataFrame should be renamed
    # accordingly.
+     global ecol_dataFrame
    ecol_dataFrame = Ecol(dataframe=dataFrame, label='label')
-     results = main()
-     print("logbook")
-     print(results[0][0])
-     for x in range(len(results[0])):
-         dic[print_evaluate(results[0][x])] = results[0][x]
+
+     print(metrics, len(metrics))
+     print(global_measures)
+     toolbox = setup_engine(options)
+     result = results(options, toolbox)
+
+     compiled_results = {}
+     for x in range(len(result[0])):
+         compiled_results[print_evaluate(result[0][x])] = result[0][x]
    outfile = open(filename, 'wb')
-     pickle.dump(dic, outfile)
+     pickle.dump(compiled_results, outfile)
    outfile.close()

-     df['label'] = results[0][0]
+     dataFrame['label'] = result[0][0]
    # Scale to original Dataset (Optional)  # TODO: Improve preprocessing
    # df = preprocess.scaleColumnsFrom(base_df, df, label_column='label')
-     df.to_csv(str(filename) + ".csv")
-     ax1 = df.plot.scatter(x=0, y=1, c='label', colormap='Paired')
-     pyplot.show()
+     dataFrame.to_csv(str(filename) + ".csv")
+
+ if __name__ == '__main__':
+     main()