@@ -150,56 +150,6 @@ def _download_object_csvs(

        logger.info(f"CSV info total: total_csvs={num_csvs} total_bytes={num_bytes}")

-    @staticmethod
-    def _load_csvs_to_db(csv_directory: str, sf_db: OnyxSalesforceSQLite) -> set[str]:
-        updated_ids: set[str] = set()
-
-        object_type_to_csv_path = SalesforceConnector.reconstruct_object_types(
-            csv_directory
-        )
-
-        # NOTE(rkuo): this timing note is meaningless without a reference point in terms
-        # of number of records, etc
-        # This takes like 10 seconds
-
-        # This is for testing the rest of the functionality if data has
-        # already been fetched and put in sqlite
-        # from import onyx.connectors.salesforce.sf_db.sqlite_functions find_ids_by_type
-        # for object_type in self.parent_object_list:
-        #     updated_ids.update(list(find_ids_by_type(object_type)))
-
-        # This takes 10-70 minutes first time (idk why the range is so big)
-        total_types = len(object_type_to_csv_path)
-        logger.info(f"Starting to process {total_types} object types")
-
-        for i, (object_type, csv_paths) in enumerate(
-            object_type_to_csv_path.items(), 1
-        ):
-            logger.info(f"Processing object type {object_type} ({i}/{total_types})")
-            # If path is None, it means it failed to fetch the csv
-            if csv_paths is None:
-                continue
-
-            # Go through each csv path and use it to update the db
-            for csv_path in csv_paths:
-                logger.debug(
-                    f"Processing CSV: object_type={object_type} "
-                    f"csv={csv_path} "
-                    f"len={Path(csv_path).stat().st_size}"
-                )
-                new_ids = sf_db.update_from_csv(
-                    object_type=object_type,
-                    csv_download_path=csv_path,
-                )
-                updated_ids.update(new_ids)
-                logger.debug(
-                    f"Added {len(new_ids)} new/updated records for {object_type}"
-                )
-
-                os.remove(csv_path)
-
-        return updated_ids
-
    @staticmethod
    def _get_all_types(parent_types: list[str], sf_client: Salesforce) -> set[str]:
        all_types: set[str] = set(parent_types)
@@ -236,6 +186,7 @@ def _fetch_from_salesforce(

        updated_ids: set[str] = set()
        docs_processed = 0
+        docs_to_yield: list[Document] = []

        sf_db = OnyxSalesforceSQLite(os.path.join(temp_dir, "salesforce_db.sqlite"))
        sf_db.connect()
@@ -266,7 +217,43 @@ def _fetch_from_salesforce(
            gc.collect()

        # Step 2 - load CSV's to sqlite
-        updated_ids = SalesforceConnector._load_csvs_to_db(temp_dir, sf_db)
+        object_type_to_csv_paths = SalesforceConnector.reconstruct_object_types(
+            temp_dir
+        )
+
+        total_types = len(object_type_to_csv_paths)
+        logger.info(f"Starting to process {total_types} object types")
+
+        for i, (object_type, csv_paths) in enumerate(
+            object_type_to_csv_paths.items(), 1
+        ):
+            logger.info(f"Processing object type {object_type} ({i}/{total_types})")
+            # If path is None, it means it failed to fetch the csv
+            if csv_paths is None:
+                continue
+
+            # Go through each csv path and use it to update the db
+            for csv_path in csv_paths:
+                logger.debug(
+                    f"Processing CSV: object_type={object_type} "
+                    f"csv={csv_path} "
+                    f"len={Path(csv_path).stat().st_size}"
+                )
+
+                # yield an empty list to keep the connector alive
+                yield docs_to_yield
+
+                new_ids = sf_db.update_from_csv(
+                    object_type=object_type,
+                    csv_download_path=csv_path,
+                )
+                updated_ids.update(new_ids)
+                logger.debug(
+                    f"Added {len(new_ids)} new/updated records for {object_type}"
+                )
+
+                os.remove(csv_path)
+
        gc.collect()

        logger.info(f"Found {len(updated_ids)} total updated records")
@@ -276,7 +263,6 @@ def _fetch_from_salesforce(

        # Step 3 - extract and index docs
        batches_processed = 0
-        docs_to_yield: list[Document] = []
        docs_to_yield_bytes = 0

        # Takes 15-20 seconds per batch
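
The substantive change in this commit is the bare `yield docs_to_yield` inside the CSV-loading loop. `_fetch_from_salesforce` is a generator, and before this change it produced nothing while Step 2 loaded CSVs into sqlite, a phase the removed comment pegs at 10-70 minutes on a first run, so a caller enforcing a liveness timeout could kill the connector mid-load. Inlining `_load_csvs_to_db` and hoisting `docs_to_yield` above Step 2 makes an empty batch available to yield as a heartbeat between CSVs. A minimal sketch of the pattern, assuming the caller treats any yield, empty or not, as a liveness signal; the names `fetch_documents` and `run_with_heartbeat` are hypothetical, not from the Onyx codebase:

```python
import time
from collections.abc import Iterator


def fetch_documents() -> Iterator[list[str]]:
    """Toy connector generator: interleaves empty heartbeat yields with real batches."""
    for _ in range(3):
        time.sleep(0.1)  # stands in for long, non-yielding work (e.g. one CSV load)
        yield []  # empty batch: no documents yet, but proves the generator is alive
    yield ["doc-1", "doc-2"]  # a real batch once documents are ready


def run_with_heartbeat(gen: Iterator[list[str]], timeout_s: float = 5.0) -> None:
    """Toy caller: resets its liveness clock on every yield, empty or not."""
    last_seen = time.monotonic()
    for batch in gen:
        gap = time.monotonic() - last_seen
        if gap > timeout_s:
            raise TimeoutError(f"connector silent for {gap:.1f}s")
        last_seen = time.monotonic()
        if batch:
            print(f"indexing {len(batch)} documents")


run_with_heartbeat(fetch_documents())
```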
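For context on the loop body: `sf_db.update_from_csv(object_type=..., csv_download_path=...)` is what moves each downloaded CSV into sqlite and reports which record ids were added or changed; its implementation is not part of this diff. A rough stdlib-only stand-in, assuming each CSV carries an `Id` column the way Salesforce bulk exports do (hypothetical code, not the `OnyxSalesforceSQLite` implementation):

```python
import csv
import json
import sqlite3


def update_from_csv(conn: sqlite3.Connection, object_type: str, csv_path: str) -> set[str]:
    """Hypothetical stand-in: upsert CSV rows keyed by Salesforce Id, return touched ids."""
    conn.execute(
        "CREATE TABLE IF NOT EXISTS records"
        " (id TEXT PRIMARY KEY, object_type TEXT, data TEXT)"
    )
    updated: set[str] = set()
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            record_id = row["Id"]  # assumes Salesforce's Id column is present
            conn.execute(
                "INSERT INTO records (id, object_type, data) VALUES (?, ?, ?)"
                " ON CONFLICT(id) DO UPDATE SET data = excluded.data",
                (record_id, object_type, json.dumps(row)),
            )
            updated.add(record_id)
    conn.commit()
    return updated
```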