Skip to content

Commit 36604f1

Browse files
authored
[Refactor] Main Code to a modular Architecture (#34)
Merge pull request #34 from SteffanoP/refactor/def-main
2 parents 9da3d8e + dbea3fd commit 36604f1

File tree

4 files changed

+213
-120
lines changed

4 files changed

+213
-120
lines changed

src/cbdgen-framework.py

Lines changed: 190 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,80 +1,87 @@
1-
import numpy as np
2-
import pandas as pd
3-
import random
4-
import matplotlib.pyplot as plt
51
import multiprocessing
62
import pickle
7-
from sklearn.datasets import load_iris
8-
from matplotlib import pyplot
3+
import random
94

5+
import numpy as np
6+
import pandas as pd
7+
from deap import algorithms
108
from deap import base
119
from deap import creator
1210
from deap import tools
13-
from deap import algorithms
14-
15-
import rpy2.robjects as robjects
16-
from meta_features.ecol import Ecol
11+
from rpy2 import robjects
1712

18-
import setup.setup_framework as setup
19-
from instances_generator.generator import InstancesGenerator
2013
import extractor
2114
import preprocess
15+
import setup.setup_framework as setup
16+
from meta_features.ecol import Ecol
17+
from instances_generator.generator import InstancesGenerator
2218

23-
# TODO: Implement Setup in a minimal main()
24-
options = setup.get_options()
25-
26-
cont = 0
27-
bobj = 0.4
28-
P = [12]
29-
SCALES = [1]
30-
tread = ""
31-
select_new_dataset = "N"
32-
NGEN = 1000
33-
# NGEN = options['NGEN']
34-
CXPB = 0.7
35-
MUTPB = 0.2
36-
INDPB = 0.05
37-
POP = 100
38-
39-
# TODO: Implement Generator of Instances in a minimal main()
40-
gen_instances = InstancesGenerator(options)
41-
df = gen_instances.generate(options['maker'][0])
42-
43-
filename = options['filename'] if options['filename'] != "" else "NGEN=" + \
44-
str(NGEN)
45-
46-
metrics = options['measures']
47-
48-
# TODO: Implement fitness global measures in a minimal main()
49-
global_measures = []
50-
if (options['filepath'] != ""):
51-
base_df = pd.read_csv(options['filepath'])
52-
target = options['label_name']
53-
54-
# Copying Columns names
55-
# df.columns = preprocess.copyFeatureNamesFrom(base_df, label_name=target)
56-
57-
# Extraction of Data Complexity Values
58-
global_measures = tuple(extractor.complexity(base_df, target, metrics))
59-
else:
60-
for metric in metrics:
61-
global_measures.append(options[metric])
62-
global_measures = tuple(global_measures)
63-
64-
filename += '-' + '-'.join(metrics)
65-
N_ATTRIBUTES = int(options['samples']) # mispelled variable name
66-
print(metrics, len(metrics))
67-
print(global_measures)
68-
NOBJ = len(metrics)
69-
70-
dic = {}
71-
72-
# reference points
73-
ref_points = [tools.uniform_reference_points(
74-
NOBJ, p, s) for p, s in zip(P, SCALES)]
75-
ref_points = np.concatenate(ref_points)
76-
_, uniques = np.unique(ref_points, axis=0, return_index=True)
77-
ref_points = ref_points[uniques]
19+
def generate_instances(samples, attributes, classes, maker: tuple[int,str]
                       ) -> pd.DataFrame:
    """
    Build an InstancesGenerator and return the data set it generates.

    Parameters
    ----------
    samples : Number of instances to be generated.
    attributes : Number of attributes/features to be generated.
    classes : Number of classes, forwarded to the generator as ``classes``.
    maker : Pair whose first item selects the type of maker passed to
        ``InstancesGenerator.generate`` and whose second item is forwarded
        to the generator as ``maker_option``.

    Returns
    -------
    pandas.DataFrame
    """
    maker_option = maker[1]
    generator = InstancesGenerator(
        samples,
        attributes,
        classes=classes,
        maker_option=maker_option,
    )
    return generator.generate(maker[0])
40+
41+
def complexity_extraction(
        measures: list[str], *,
        dataframe_label: "tuple[pd.DataFrame, str] | None" = None,
        complexity_values: "dict | None" = None,
) -> tuple[np.float64, ...]:
    """
    Return the complexity values for the requested measures.

    The values come from one of two sources: they are extracted from a data
    set through the extractor module when ``dataframe_label`` is given,
    otherwise they are looked up in ``complexity_values``.

    Parameters
    ----------
    measures : A list of complexity measures to obtain.
    dataframe_label : Pair ``(DataFrame, label)`` with the data set and the
        name of its label column. When given, ``complexity_values`` is not
        read.
    complexity_values : Dictionary mapping each measure name to its value,
        used only when ``dataframe_label`` is None.

    Returns
    -------
    tuple
        One value per entry of ``measures``, in the same order.

    Raises
    ------
    ValueError
        If neither ``dataframe_label`` nor ``complexity_values`` is given.
    KeyError
        If a measure is missing from ``complexity_values``.
    """
    if dataframe_label is not None:
        # Extraction of Data Complexity Values
        dataframe, label = dataframe_label[0], dataframe_label[1]
        return tuple(extractor.complexity(dataframe, label, measures))

    if complexity_values is None:
        raise ValueError(
            "either dataframe_label or complexity_values must be provided"
        )
    return tuple(complexity_values[cm] for cm in measures)
67+
68+
# TODO: Build a clever architecture for the filename
69+
def build_filename(filename: str='', *, ngen: int, metrics: list) -> str:
    """
    Build a filename based on the number of generations and the metrics
    used to optimize.

    Parameters
    ----------
    filename : Name or prefix of the file that contains the result of the
        optimization process. When empty (or None), ``"NGEN=<ngen>"`` is
        used as the prefix instead.
    ngen : Number of generations of the current run of optimization.
    metrics : A list of metrics used to optimize; each one is converted
        with ``str`` before being joined.

    Returns
    -------
    str
        The prefix followed by ``-`` and the metrics joined by ``-``.
    """
    # Truthiness covers both '' and None, so a missing name cannot reach the
    # concatenation below and fail with a TypeError.
    prefix = filename if filename else f"NGEN={ngen}"
    # str() lets metrics that are not plain strings take part in the join.
    return prefix + '-' + '-'.join(str(metric) for metric in metrics)
7885

7986
def my_evaluate(individual):
8087
vetor = []
@@ -100,25 +107,70 @@ def print_evaluate(individual):
100107

101108
return tuple(vetor)
102109

103-
104-
creator.create("FitnessMin", base.Fitness, weights=(-1.0,)*NOBJ)
105-
creator.create("Individual", list, fitness=creator.FitnessMin)
106-
107-
RANDINT_LOW = 0
108-
RANDINT_UP = options['classes'] - 1
109-
110-
toolbox = base.Toolbox()
111-
toolbox.register("attr_int", random.randint, RANDINT_LOW, RANDINT_UP)
112-
toolbox.register("individual", tools.initRepeat,
113-
creator.Individual, toolbox.attr_int, N_ATTRIBUTES)
114-
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
115-
toolbox.register("evaluate", my_evaluate)
116-
toolbox.register("mate", tools.cxTwoPoint)
117-
toolbox.register("mutate", tools.mutShuffleIndexes, indpb=INDPB)
118-
toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
119-
120-
121-
def main(seed=None):
110+
def setup_engine(options):
    """
    Create and configure the deap Toolbox used by the search engine.

    Parameters
    ----------
    options : Dictionary of setup parameters. The keys read here are
        'samples', 'measures', 'P', 'SCALES', 'classes' and 'INDPB'.

    Returns
    -------
    deap.base.Toolbox
        Toolbox with 'attr_int', 'individual', 'population', 'evaluate',
        'mate', 'mutate' and 'select' registered.
    """
    individual_size = int(options['samples'])
    objective_count = len(options['measures'])

    # Reference points: one uniform set per (P, SCALES) pair, stacked
    # together and then reduced to their unique rows.
    point_sets = []
    for partitions, scale in zip(options['P'], options['SCALES']):
        point_sets.append(
            tools.uniform_reference_points(objective_count, partitions, scale)
        )
    stacked_points = np.concatenate(point_sets)
    unique_rows = np.unique(stacked_points, axis=0, return_index=True)[1]
    reference_points = stacked_points[unique_rows]

    # One negative weight for each objective.
    objective_weights = (-1.0,) * objective_count
    creator.create("FitnessMin", base.Fitness, weights=objective_weights)
    creator.create("Individual", list, fitness=creator.FitnessMin)

    # Every gene is an integer drawn from [0, classes - 1].
    lowest_gene = 0
    highest_gene = options['classes'] - 1

    engine = base.Toolbox()
    engine.register("attr_int", random.randint, lowest_gene, highest_gene)
    engine.register("individual", tools.initRepeat,
                    creator.Individual, engine.attr_int, individual_size)
    engine.register("population", tools.initRepeat, list, engine.individual)
    engine.register("evaluate", my_evaluate)
    engine.register("mate", tools.cxTwoPoint)
    mutation_probability = options['INDPB']
    engine.register("mutate", tools.mutShuffleIndexes,
                    indpb=mutation_probability)
    engine.register("select", tools.selNSGA3, ref_points=reference_points)

    return engine
151+
152+
def results(options: dict, toolbox: base.Toolbox):
153+
"""
154+
Function that operates the search engine process by operating an
155+
evolutionary algorithm to find the best results.
156+
157+
Parameters
158+
----------
159+
options : Dictionary of setup parameters highly necessary to how the
160+
search engine will find the solutions.
161+
toolbox : A Toolbox for evolution that contains evolutionary operators.
162+
163+
Returns
164+
-------
165+
deap.base.toolbox.population : A population of the best individuals
166+
from the search engine process.
167+
deap.tools.logbook : A logbook that contains evolutionary and
168+
statistics information about the search process.
169+
"""
170+
pop = options['POP']
171+
cxpb = options['CXPB']
172+
mutpb = options['MUTPB']
173+
ngen = options['NGEN']
122174
random.seed(64)
123175
pool = multiprocessing.Pool(processes=12)
124176
toolbox.register("map", pool.map)
@@ -132,58 +184,84 @@ def main(seed=None):
132184
logbook = tools.Logbook()
133185
logbook.header = "gen", "evals", "std", "min", "avg", "max"
134186

135-
pop = toolbox.population(POP)
187+
tool_pop = toolbox.population(pop)
136188

137189
# Evaluate the individuals with an invalid fitness
138-
invalid_ind = [ind for ind in pop if not ind.fitness.valid]
190+
invalid_ind = [ind for ind in tool_pop if not ind.fitness.valid]
139191
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
140192
for ind, fit in zip(invalid_ind, fitnesses):
141193
ind.fitness.values = fit
142194
# Compile statistics about the population
143-
record = stats.compile(pop)
195+
record = stats.compile(tool_pop)
144196

145197
logbook.record(gen=0, evals=len(invalid_ind), **record)
146198
print(logbook.stream)
147199
# Begin the generational process
148-
for gen in range(1, NGEN):
149-
offspring = algorithms.varAnd(pop, toolbox, CXPB, MUTPB)
200+
for gen in range(1, ngen):
201+
offspring = algorithms.varAnd(tool_pop, toolbox, cxpb, mutpb)
150202
# Evaluate the individuals with an invalid fitness
151203
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
152204
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
153205
for ind, fit in zip(invalid_ind, fitnesses):
154206
ind.fitness.values = fit
155207
# Select the next generation population from parents and offspring
156-
pop = toolbox.select(pop + offspring, POP)
208+
tool_pop = toolbox.select(tool_pop + offspring, pop)
157209

158210
# Compile statistics about the new population
159-
record = stats.compile(pop)
211+
record = stats.compile(tool_pop)
160212
logbook.record(gen=gen, evals=len(invalid_ind), **record)
161213
print(logbook.stream)
162-
return pop, logbook
214+
return tool_pop, logbook
215+
216+
def main():
    """
    Run the whole pipeline: setup, instance generation, search and export.

    Reads the setup options, generates the base instances, obtains the
    target complexity values (extracted from the CSV in
    ``options['filepath']`` when one is given, otherwise taken from the
    per-measure entries of ``options``), runs the search engine and writes
    two files: a pickle mapping each ``print_evaluate`` tuple to its
    individual, and ``<filename>.csv`` with the generated instances labelled
    by the first individual of the final population.
    """
    # NOTE(review): these names are kept as module globals exactly as
    # before; presumably my_evaluate/print_evaluate read them (their bodies
    # are not visible here) -- confirm before turning them into parameters.
    global dataFrame, metrics, global_measures, ecol_dataFrame

    options = setup.get_options()
    metrics = options['measures']

    # Only build the (DataFrame, label) pair when a base data set was
    # supplied. Previously base_df was used even when it had never been
    # assigned, and the option-driven fallback could never be reached.
    dataframe_label = None
    if options['filepath'] != '':
        base_df = pd.read_csv(options['filepath'])
        dataframe_label = (base_df, options['label_name'])

    dataFrame = generate_instances(options['samples'], options['attributes'],
                                   options['classes'], options['maker'])

    # The per-measure option values are only needed (and only looked up)
    # when there is no base data set to extract them from.
    if dataframe_label is None:
        complexity_values = {measure: options[measure] for measure in metrics}
    else:
        complexity_values = {}
    global_measures = complexity_extraction(
        metrics,
        dataframe_label=dataframe_label,
        complexity_values=complexity_values,
    )

    filename = build_filename(options['filename'],
                              ngen=options['NGEN'],
                              metrics=metrics)

    # This Ecol object should be called according to the variable dataFrame.
    # If dataFrame is renamed, then ecol_dataFrame should be renamed
    # accordingly.
    ecol_dataFrame = Ecol(dataframe=dataFrame, label='label')

    print(metrics, len(metrics))
    print(global_measures)
    toolbox = setup_engine(options)
    population = results(options, toolbox)[0]

    compiled_results = {
        print_evaluate(individual): individual for individual in population
    }
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as outfile:
        pickle.dump(compiled_results, outfile)

    dataFrame['label'] = population[0]
    # Scale to original Dataset (Optional) #TODO: Improve preprocessing
    # df = preprocess.scaleColumnsFrom(base_df, df, label_column='label')
    dataFrame.to_csv(str(filename)+".csv")
265+
266+
if __name__ == '__main__':
267+
main()

src/instances_generator/generator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class InstancesGenerator:
2020
5: '_mlabel_classf'
2121
}
2222

23-
def __init__(self, options: dict):
23+
def __init__(self, samples, attributes, classes=None, maker_option=None):
2424
"""
2525
Constructs the generator based on properties desired.
2626
@@ -30,10 +30,10 @@ def __init__(self, options: dict):
3030
properties desired to generate a dataset (e.g. samples,
3131
attributes, classes).
3232
"""
33-
self._samples = options['samples']
34-
self._attributes = options['attributes']
35-
self._classes = options['classes']
36-
self._optional_option = options['maker'][1]
33+
self._samples = samples
34+
self._attributes = attributes
35+
self._classes = classes
36+
self._optional_option = maker_option
3737

3838
def generate(self, type_gen: int) -> DataFrame:
3939
"""

src/setup/interactor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ def measures_input() -> list:
7575
# Appends every Complexity Measure in measures list
7676
return [cm(measure) for measure in input_Stream]
7777

78+
def generation_input() -> int:
    """
    Ask on the console how many generations the optimization should run.

    Returns
    -------
    int
        The typed answer converted with ``int``; a non-integer answer
        raises ValueError.
    """
    answer = input("Com quantas gerações você deseja otimizar o dataset?\n")
    return int(answer)
80+
7881
def __input_with_default__(input_text: str, default_value, data_type: type):
7982
try:
8083
return data_type(input(input_text))

0 commit comments

Comments
 (0)