@@ -135,9 +135,16 @@ def generate_training_data(self):
135135 X_train .append (features )
136136 y_train .append (0 ) # Negative recommendation
137137
138- # Generate synthetic data if we don't have enough training examples
139- if len (X_train ) < 50 :
140- X_train , y_train = self ._generate_synthetic_data (X_train , y_train )
138+ print (f"Generated { len (X_train )} training examples from real data" )
139+
140+ # Always generate synthetic data to ensure sufficient training examples
141+ # and proper class diversity
142+ X_train , y_train = self ._generate_synthetic_data (X_train , y_train )
143+
144+ print (f"Final training dataset: { len (X_train )} examples" )
145+ if len (X_train ) > 0 :
146+ unique_labels = set (y_train )
147+ print (f"Class distribution: { dict (zip (* np .unique (y_train , return_counts = True )))} " )
141148
142149 return np .array (X_train ), np .array (y_train )
143150
@@ -157,36 +164,68 @@ def _generate_synthetic_data(self, X_existing, y_existing):
157164
158165 # Generate synthetic examples based on common patterns
159166 synthetic_patterns = [
160- # High video preference pattern
167+ # High video preference pattern (positive examples)
161168 [1 , 1 , 1 , 1 , 0.8 , 0.8 , 0.75 , 0.6 , 0.7 ], # Features for video lovers
162- # Low video preference pattern
169+ [0.8 , 0.9 , 0.9 , 0.8 , 0.7 , 0.6 , 0.8 , 0.7 , 0.8 ], # Another positive pattern
170+ [0.9 , 0.8 , 1 , 0.9 , 0.6 , 0.7 , 0.7 , 0.8 , 0.9 ], # High video helpfulness
171+ [1 , 1 , 0.8 , 1 , 0.5 , 0.5 , 0.6 , 0.5 , 0.6 ], # Always use videos for tests
172+
173+ # Low video preference pattern (negative examples)
163174 [0 , 0.25 , 0.3 , 0 , 0.5 , 0.5 , 0.5 , 0.4 , 0.3 ], # Features for non-video learners
164- # Moderate video preference pattern
175+ [0.2 , 0.1 , 0.2 , 0.1 , 0.6 , 0.7 , 0.4 , 0.3 , 0.2 ], # Another negative pattern
176+ [0 , 0.3 , 0.1 , 0.2 , 0.8 , 0.9 , 0.6 , 0.2 , 0.1 ], # High confidence, low video need
177+ [0.1 , 0.2 , 0.3 , 0.1 , 0.4 , 0.3 , 0.3 , 0.4 , 0.2 ], # Low interest in videos
178+
179+ # Moderate video preference pattern (mixed examples)
165180 [0.5 , 0.75 , 0.7 , 0.7 , 0.6 , 0.6 , 0.6 , 0.6 , 0.5 ], # Balanced learners
166- # High confidence, low video need
167- [0 , 0.5 , 0.3 , 0.4 , 0.8 , 1 , 0.75 , 0.4 , 0.7 ],
168- # Low confidence, high video need
169- [1 , 1 , 1 , 1 , 0.4 , 0.2 , 0.5 , 0.8 , 0.7 ],
181+ [0.6 , 0.5 , 0.6 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 , 0.5 ], # Neutral pattern
170182 ]
171183
172- for pattern in synthetic_patterns :
173- # Add some noise to make it more realistic
174- for _ in range (10 ):
175- noisy_pattern = pattern + np .random .normal (0 , 0.1 , len (pattern ))
184+ # Generate balanced synthetic data
185+ positive_patterns = synthetic_patterns [:4 ] # First 4 are positive
186+ negative_patterns = synthetic_patterns [4 :8 ] # Next 4 are negative
187+ neutral_patterns = synthetic_patterns [8 :] # Last 2 are neutral
188+
189+ # Generate positive examples
190+ for pattern in positive_patterns :
191+ for _ in range (8 ): # Generate 8 positive examples per pattern
192+ noisy_pattern = pattern + np .random .normal (0 , 0.08 , len (pattern ))
176193 noisy_pattern = np .clip (noisy_pattern , 0 , 1 ) # Keep values between 0 and 1
194+ X_synthetic .append (noisy_pattern )
195+ y_synthetic .append (1 ) # Positive
177196
197+ # Generate negative examples
198+ for pattern in negative_patterns :
199+ for _ in range (8 ): # Generate 8 negative examples per pattern
200+ noisy_pattern = pattern + np .random .normal (0 , 0.08 , len (pattern ))
201+ noisy_pattern = np .clip (noisy_pattern , 0 , 1 ) # Keep values between 0 and 1
178202 X_synthetic .append (noisy_pattern )
203+ y_synthetic .append (0 ) # Negative
179204
180- # Determine label based on video preference features
181- video_preference_score = (noisy_pattern [0 ] + noisy_pattern [2 ] + noisy_pattern [3 ]) / 3
182- if video_preference_score > 0.6 :
183- y_synthetic .append (1 ) # Positive
184- else :
185- y_synthetic .append (0 ) # Negative
205+ # Generate neutral examples (distribute between positive and negative)
206+ for pattern in neutral_patterns :
207+ for i in range (6 ): # Generate 6 neutral examples per pattern
208+ noisy_pattern = pattern + np .random .normal (0 , 0.1 , len (pattern ))
209+ noisy_pattern = np .clip (noisy_pattern , 0 , 1 ) # Keep values between 0 and 1
210+ X_synthetic .append (noisy_pattern )
211+ # Alternate between positive and negative for neutral patterns
212+ y_synthetic .append (1 if i % 2 == 0 else 0 )
213+
214+ # Add some completely random examples for diversity
215+ for _ in range (20 ):
216+ random_features = np .random .random (len (self .feature_names ))
217+ X_synthetic .append (random_features )
218+ # Determine label based on video preference features
219+ video_preference_score = (random_features [0 ] + random_features [2 ] + random_features [3 ]) / 3
220+ y_synthetic .append (1 if video_preference_score > 0.5 else 0 )
186221
187222 # Combine existing and synthetic data
188- X_combined = np .vstack ([X_existing , X_synthetic ]) if len (X_existing ) > 0 else np .array (X_synthetic )
189- y_combined = np .concatenate ([y_existing , y_synthetic ]) if len (y_existing ) > 0 else np .array (y_synthetic )
223+ if len (X_existing ) > 0 :
224+ X_combined = np .vstack ([X_existing , X_synthetic ])
225+ y_combined = np .concatenate ([y_existing , y_synthetic ])
226+ else :
227+ X_combined = np .array (X_synthetic )
228+ y_combined = np .array (y_synthetic )
190229
191230 return X_combined , y_combined
192231
@@ -206,13 +245,27 @@ def train_model(self):
206245 return False
207246
208247 # Ensure there are both positive and negative labels
209- if len (set (y_train )) < 2 :
210- print ("Not enough class diversity in training labels" )
211- return False
248+ unique_labels = set (y_train )
249+ if len (unique_labels ) < 2 :
250+ print (f"Not enough class diversity in training labels. Found: { unique_labels } " )
251+ # Force generation of synthetic data with both classes
252+ X_train , y_train = self ._generate_synthetic_data ([], [])
253+ unique_labels = set (y_train )
254+ if len (unique_labels ) < 2 :
255+ print ("Failed to generate diverse synthetic data" )
256+ return False
257+
258+ print (f"Training data: { len (X_train )} examples with { len (unique_labels )} classes" )
259+ print (f"Class distribution: { dict (zip (* np .unique (y_train , return_counts = True )))} " )
260+
261+ # Ensure minimum dataset size
262+ if len (X_train ) < 20 :
263+ print ("Dataset too small, generating more synthetic data" )
264+ X_train , y_train = self ._generate_synthetic_data (X_train , y_train )
212265
213266 # Split data for validation
214267 X_train_split , X_val , y_train_split , y_val = train_test_split (
215- X_train , y_train , test_size = 0.2 , random_state = 42
268+ X_train , y_train , test_size = 0.2 , random_state = 42 , stratify = y_train
216269 )
217270
218271 # Scale features
@@ -224,7 +277,8 @@ def train_model(self):
224277 random_state = 42 ,
225278 max_iter = 1000 ,
226279 C = 1.0 ,
227- solver = 'liblinear'
280+ solver = 'liblinear' ,
281+ class_weight = 'balanced' # Handle class imbalance
228282 )
229283
230284 self .model .fit (X_train_scaled , y_train_split )
@@ -234,6 +288,7 @@ def train_model(self):
234288 accuracy = accuracy_score (y_val , y_pred )
235289
236290 print (f"Model trained successfully. Validation accuracy: { accuracy :.3f} " )
291+ print (f"Training set size: { len (X_train_split )} , Validation set size: { len (X_val )} " )
237292
238293 # Save the model
239294 self .save_model ()
@@ -242,6 +297,8 @@ def train_model(self):
242297
243298 except Exception as e :
244299 print (f"Error training model: { str (e )} " )
300+ import traceback
301+ traceback .print_exc ()
245302 return False
246303
247304 def predict_video_preference (self , survey ):
@@ -439,6 +496,50 @@ def load_model(self):
439496 print (f"Error loading model: { str (e )} " )
440497 return False
441498
def force_retrain(self):
    """Discard any persisted model artifacts and train from scratch.

    Deletes the saved model/scaler/encoder files (best effort), resets the
    in-memory state, then delegates to ``train_model``.

    Returns:
        bool: True if training was successful.
    """
    print("Forcing model retraining...")

    # Best-effort cleanup: a file that is absent is skipped silently, and a
    # failed delete is only reported as a warning, never raised.
    stale_files = (self.model_path, self.scaler_path, self.encoders_path)
    for path in stale_files:
        if not os.path.exists(path):
            continue
        try:
            os.remove(path)
            print(f"Removed existing model file: {path}")
        except Exception as e:
            print(f"Warning: Could not remove {path}: {e}")

    # Drop the in-memory state so train_model starts from a clean slate.
    self.model = None
    self.scaler = StandardScaler()
    self.label_encoders = {}

    return self.train_model()
523+
442524
# Global recommender instance
recommender = VideoRecommender()


def initialize_recommender():
    """Initialize the global recommender instance.

    Loads the previously saved model if one exists; otherwise trains a new
    model. If both loading and training fail, the recommender falls back to
    its non-ML heuristic. Returns the module-level instance either way.
    """
    global recommender

    print("Initializing AI recommendation system...")

    # Prefer a persisted model; train only when loading fails.
    if recommender.load_model():
        print("Existing model loaded successfully")
        return recommender

    print("No existing model found, training new model...")
    if recommender.train_model():
        print("Model trained and saved successfully")
    else:
        print("Failed to train model, recommender will use fallback method")
    return recommender