From dbd307f634ec3955629e667aa6ce0487b5ebb653 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 22 Mar 2023 10:29:41 +0100 Subject: [PATCH 01/43] fix the issues in preprocess and np.stack in batches in vec --- type4py/__main__.py | 20 +++ type4py/data_loaders.py | 267 ++++++++++++++++++++++++++-------------- type4py/preprocess.py | 6 + type4py/vectorize.py | 27 +++- 4 files changed, 229 insertions(+), 91 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index e5dc536..c06718e 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -13,6 +13,10 @@ 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels, 'name': 'complete'} +data_loading_comb_sep = {'train': data_loaders.load_combined_train_data_split, 'valid': data_loaders.load_combined_valid_data_split, + 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels_split, + 'name': 'complete_sep'} + data_loading_woi = {'train': data_loaders.load_combined_train_data_woi, 'valid': data_loaders.load_combined_valid_data_woi, 'test': data_loaders.load_combined_test_data_woi, 'labels': data_loaders.load_combined_labels, 'name': 'woi'} @@ -63,6 +67,12 @@ def learn(args): else: train(args.o, data_loading_comb, args.p, args.v) +def learn_split(args): + from type4py.learn import train + setup_logs_file(args.o, "learn_sep") + if args.c: + train(args.o, data_loading_comb_sep, args.p, args.v) + def predict(args): from type4py.predict import test setup_logs_file(args.o, "predict") @@ -131,6 +141,16 @@ def main(): learning_parser.add_argument('--v', '--validation', default=False, action="store_true", help="Evaluating Type4Py on the validation set when training") learning_parser.set_defaults(func=learn) + # Learning phase split + learning_parser_sep = sub_parsers.add_parser('learn_sep') + learning_parser_sep.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") + learning_parser_sep.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") + learning_parser_sep.add_argument('--p', '--parameters', required=False, type=str, + help="Path to the JSON file of model's hyper-parameters") + learning_parser_sep.add_argument('--v', '--validation', default=False, action="store_true", + help="Evaluating Type4Py on the validation set when training") + learning_parser_sep.set_defaults(func=learn_split) + # Prediction phase predict_parser = sub_parsers.add_parser('predict') predict_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") diff --git a/type4py/data_loaders.py b/type4py/data_loaders.py index 2bb28cb..a103513 100644 --- a/type4py/data_loaders.py +++ b/type4py/data_loaders.py @@ -9,39 +9,85 @@ logger.name = __name__ + def load_data_tensors_TW(filename, limit=-1): return torch.from_numpy(np.load(filename)).float() -def load_flat_labels_tensors(filename): +def load_flat_labels_tensors(filename): return torch.from_numpy(np.load(filename)).long() + def to_numpy(tensor): return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() + # Combined data def load_combined_train_data(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ - 
torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ + torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy')))) - + + +def load_combined_train_data_split(output_path: str, type: str): + if type == "var": + logger.info("Loading Variable set...") + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy')) + elif type == "ret": + logger.info("Loading return type set...") + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')) + elif type == "param": + logger.info("Loading param type set...") + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_aval_types_dp.npy')) + else: + logger.info("Type set not defined or not found...") + + def load_combined_valid_data(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')))), \ - torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 
'identifiers_var_valid_datapoints_x.npy')))), \ + torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy')))) + +def load_combined_valid_data_split(output_path: str, type: str): + if type == "var": + return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy')) + elif type == "ret": + return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')) + elif type == "param": + return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_aval_types_dp.npy')) + else: + logger.info("Type set not defined or not found...") + + def load_combined_test_data(output_path: str): id_p_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_param_test_datapoints_x.npy')) id_r_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_ret_test_datapoints_x.npy')) @@ -54,9 +100,10 @@ def load_combined_test_data(output_path: str): torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'test', 'params_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'ret_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'var_test_aval_types_dp.npy')))), \ - (len(id_p_te)-1, (len(id_p_te)+len(id_r_te))-1, (len(id_p_te)+len(id_r_te)+len(id_v_te))-1) - # indexes of combined test data for separating between prediction tasks - + (len(id_p_te) - 1, (len(id_p_te) + len(id_r_te)) - 1, (len(id_p_te) + len(id_r_te) + len(id_v_te)) - 1) + # indexes of combined test data for separating between prediction tasks + + def load_combined_labels(output_path: str): return torch.cat((load_flat_labels_tensors(join(output_path, 'vectors', 'train', 'params_train_dps_y_all.npy')), load_flat_labels_tensors(join(output_path, 'vectors', 'train', 'ret_train_dps_y_all.npy')), @@ -68,23 +115,43 @@ def load_combined_labels(output_path: str): load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'ret_test_dps_y_all.npy')), load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'var_test_dps_y_all.npy')))) + +def load_combined_labels_split(output_path: str, type: str): + if type == "var": + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_dps_y_all.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_dps_y_all.npy')), \ + 
load_data_tensors_TW(join(output_path, 'vectors', 'test', 'var_test_dps_y_all.npy')) + elif type == "ret": + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_dps_y_all.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_dps_y_all.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'test', 'ret_test_dps_y_all.npy')) + elif type == "param": + return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_dps_y_all.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_dps_y_all.npy')), \ + load_data_tensors_TW(join(output_path, 'vectors', 'test', 'params_test_dps_y_all.npy')) + + # Loading data for Type4Py model w/o identifiers def load_combined_train_data_woi(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy')))) + def load_combined_valid_data_woi(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy')))) + def load_combined_test_data_woi(output_path: str): tk_p_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_param_test_datapoints_x.npy')) tk_r_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_ret_test_datapoints_x.npy')) @@ -94,26 +161,31 @@ def load_combined_test_data_woi(output_path: str): torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'test', 'params_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'ret_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'var_test_aval_types_dp.npy')))), \ - (len(tk_p_te)-1, (len(tk_p_te)+len(tk_r_te))-1, (len(tk_p_te)+len(tk_r_te)+len(tk_v_te))-1) - # indexes of combined test data for separating between prediction tasks + (len(tk_p_te) - 1, 
(len(tk_p_te) + len(tk_r_te)) - 1, (len(tk_p_te) + len(tk_r_te) + len(tk_v_te)) - 1) + # indexes of combined test data for separating between prediction tasks + # Loading data for Type4Py model w/o code contexts def load_combined_train_data_woc(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy')))) + def load_combined_valid_data_woc(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')))), \ + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')))), \ torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy')))) + def load_combined_test_data_woc(output_path: str): id_p_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_param_test_datapoints_x.npy')) id_r_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_ret_test_datapoints_x.npy')) @@ -123,25 +195,32 @@ def load_combined_test_data_woc(output_path: str): torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'test', 'params_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'ret_test_aval_types_dp.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'var_test_aval_types_dp.npy')))), \ - (len(id_p_te)-1, (len(id_p_te)+len(id_r_te))-1, (len(id_p_te)+len(id_r_te)+len(id_v_te))-1) - # indexes of combined test data for separating between prediction tasks + (len(id_p_te) - 1, (len(id_p_te) + len(id_r_te)) - 1, (len(id_p_te) + len(id_r_te) + len(id_v_te)) - 1) + # indexes of combined test data for separating between prediction tasks + # Loading data for Type4Py model w/o visible type hints def load_combined_train_data_wov(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 
'identifiers_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ - torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))) + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')))), \ + torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_param_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')))) + def load_combined_valid_data_wov(output_path: str): - return torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')))), \ - torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), - load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))) + return torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')))), \ + torch.cat( + (load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), + load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')))) + def load_combined_test_data_wov(output_path: str): id_p_te = load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_param_test_datapoints_x.npy')) @@ -152,8 +231,8 @@ def load_combined_test_data_wov(output_path: str): torch.cat((load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_param_test_datapoints_x.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_ret_test_datapoints_x.npy')), load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_var_test_datapoints_x.npy')))), \ - (len(id_p_te)-1, (len(id_p_te)+len(id_r_te))-1, (len(id_p_te)+len(id_r_te)+len(id_v_te))-1) - # indexes of combined test data for separating between prediction tasks + (len(id_p_te) - 1, (len(id_p_te) + len(id_r_te)) - 1, (len(id_p_te) + len(id_r_te) + len(id_v_te)) - 1) + # indexes of combined test data for separating between prediction tasks # Argument data @@ -162,80 +241,93 @@ def load_param_train_data(output_path: str): load_data_tensors_TW(join(output_path, 'vectors', 'train', 
'tokens_param_train_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'train', 'params_train_aval_types_dp.npy')) + def load_param_valid_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_param_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_param_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'params_valid_aval_types_dp.npy')) + def load_param_test_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_param_test_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_param_test_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'test', 'params_test_aval_types_dp.npy')) + def load_param_labels(output_path: str): return load_flat_labels_tensors(join(output_path, 'vectors', 'train', 'params_train_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'valid', 'params_valid_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'params_test_dps_y_all.npy')) - + + # Return data def load_ret_train_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_ret_train_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_ret_train_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')) + def load_ret_valid_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_ret_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_ret_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')) + def load_ret_test_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_ret_test_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_ret_test_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'test', 'ret_test_aval_types_dp.npy')) + def load_ret_labels(output_path: str): return load_flat_labels_tensors(join(output_path, 'vectors', 'train', 'ret_train_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'valid', 'ret_valid_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'ret_test_dps_y_all.npy')) + # Variable data def load_var_train_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'identifiers_var_train_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'train', 'tokens_var_train_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy')) + def load_var_valid_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'tokens_var_valid_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy')) + def load_var_test_data(output_path: str): return load_data_tensors_TW(join(output_path, 'vectors', 'test', 'identifiers_var_test_datapoints_x.npy')), \ load_data_tensors_TW(join(output_path, 'vectors', 'test', 'tokens_var_test_datapoints_x.npy')), \ 
load_data_tensors_TW(join(output_path, 'vectors', 'test', 'var_test_aval_types_dp.npy')) + def load_var_labels(output_path: str): return load_flat_labels_tensors(join(output_path, 'vectors', 'train', 'var_train_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'valid', 'var_valid_dps_y_all.npy')), \ load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'var_test_dps_y_all.npy')) + def select_data(data, n): """ Selects data points that are frequent more than n times """ - + mask = torch.tensor([False] * data.shape[0], dtype=torch.bool) counter = Counter(data.data.numpy()) - + for i, d in enumerate(data): if counter[d.item()] >= n: mask[i] = True - + return mask + def load_training_data_per_model(data_loading_funcs: dict, output_path: str, - no_batches: int, train_mode:bool=True, load_valid_data:bool=True, - no_workers:int=8) -> Tuple[DataLoader, DataLoader]: + no_batches: int, train_mode: bool = True, load_valid_data: bool = True, + no_workers: int = 8) -> Tuple[DataLoader, DataLoader]: """ Loads appropriate training data based on the model's type """ @@ -253,18 +345,18 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, train_mask = select_data(Y_all_train, MIN_DATA_POINTS) X_tok_train, X_type_train, Y_all_train = X_tok_train[train_mask], \ - X_type_train[train_mask], Y_all_train[train_mask] + X_type_train[train_mask], Y_all_train[train_mask] valid_mask = select_data(Y_all_valid, MIN_DATA_POINTS) X_tok_valid, X_type_valid, Y_all_valid = X_tok_valid[valid_mask], \ - X_type_valid[valid_mask], Y_all_valid[valid_mask] + X_type_valid[valid_mask], Y_all_valid[valid_mask] triplet_data_train = TripletDataset(X_tok_train, X_type_train, labels=Y_all_train, - dataset_name=data_loading_funcs['name'], train_mode=train_mode) + dataset_name=data_loading_funcs['name'], train_mode=train_mode) triplet_data_valid = TripletDataset(X_tok_valid, X_type_valid, labels=Y_all_valid, dataset_name=data_loading_funcs['name'], train_mode=train_mode) - + elif data_loading_funcs['name'] == 'woc': # without code tokens X_id_train, X_type_train = data_loading_funcs['train'](output_path) @@ -273,14 +365,14 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, train_mask = select_data(Y_all_train, MIN_DATA_POINTS) X_id_train, X_type_train, Y_all_train = X_id_train[train_mask], \ - X_type_train[train_mask], Y_all_train[train_mask] + X_type_train[train_mask], Y_all_train[train_mask] valid_mask = select_data(Y_all_valid, MIN_DATA_POINTS) X_id_valid, X_type_valid, Y_all_valid = X_id_valid[valid_mask], \ - X_type_valid[valid_mask], Y_all_valid[valid_mask] + X_type_valid[valid_mask], Y_all_valid[valid_mask] triplet_data_train = TripletDataset(X_id_train, X_type_train, labels=Y_all_train, - dataset_name=data_loading_funcs['name'], train_mode=train_mode) + dataset_name=data_loading_funcs['name'], train_mode=train_mode) triplet_data_valid = TripletDataset(X_id_valid, X_type_valid, labels=Y_all_valid, dataset_name=data_loading_funcs['name'], train_mode=train_mode) @@ -293,18 +385,18 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, train_mask = select_data(Y_all_train, MIN_DATA_POINTS) X_id_train, X_tok_train, Y_all_train = X_id_train[train_mask], \ - X_tok_train[train_mask], Y_all_train[train_mask] + X_tok_train[train_mask], Y_all_train[train_mask] valid_mask = select_data(Y_all_valid, MIN_DATA_POINTS) X_id_valid, X_tok_valid, Y_all_valid = X_id_valid[valid_mask], \ - X_tok_valid[valid_mask], Y_all_valid[valid_mask] + 
X_tok_valid[valid_mask], Y_all_valid[valid_mask] triplet_data_train = TripletDataset(X_id_train, X_tok_train, labels=Y_all_train, - dataset_name=data_loading_funcs['name'], train_mode=train_mode) + dataset_name=data_loading_funcs['name'], train_mode=train_mode) triplet_data_valid = TripletDataset(X_id_valid, X_tok_valid, labels=Y_all_valid, dataset_name=data_loading_funcs['name'], train_mode=train_mode) - + else: # Complete model X_id_train, X_tok_train, X_type_train = data_loading_funcs['train'](output_path) @@ -314,20 +406,21 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, Y_all_train, _, _ = data_loading_funcs['labels'](output_path) train_mask = select_data(Y_all_train, MIN_DATA_POINTS) - + X_id_train = X_id_train[train_mask] X_tok_train = X_tok_train[train_mask] X_type_train = X_type_train[train_mask] Y_all_train = Y_all_train[train_mask] - + # X_id_train, X_tok_train, X_type_train, Y_all_train = X_id_train[train_mask], \ # X_tok_train[train_mask], X_type_train[train_mask], Y_all_train[train_mask] triplet_data_train = TripletDataset(X_id_train, X_tok_train, X_type_train, labels=Y_all_train, - dataset_name=data_loading_funcs['name'], train_mode=train_mode) + dataset_name=data_loading_funcs['name'], train_mode=train_mode) + + logger.info( + f"Loaded train set of the {data_loading_funcs['name']} dataset in {(time() - load_data_t) / 60:.2f} min") - logger.info(f"Loaded train set of the {data_loading_funcs['name']} dataset in {(time()-load_data_t)/60:.2f} min") - if load_valid_data: X_id_valid, X_tok_valid, X_type_valid = data_loading_funcs['valid'](output_path) valid_mask = select_data(Y_all_valid, MIN_DATA_POINTS) @@ -336,8 +429,8 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, X_type_valid = X_type_valid[valid_mask] Y_all_valid = Y_all_valid[valid_mask] triplet_data_valid = TripletDataset(X_id_valid, X_tok_valid, X_type_valid, labels=Y_all_valid, - dataset_name=data_loading_funcs['name'], - train_mode=train_mode) + dataset_name=data_loading_funcs['name'], + train_mode=train_mode) logger.info(f"Loaded valid set of the {data_loading_funcs['name']} dataset") train_loader = DataLoader(triplet_data_train, batch_size=no_batches, shuffle=True, @@ -349,8 +442,9 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, else: return train_loader, None + def load_test_data_per_model(data_loading_funcs: dict, output_path: str, - no_batches: int, drop_last_batch:bool=False): + no_batches: int, drop_last_batch: bool = False): """ Loads appropriate training data based on the model's type """ @@ -361,16 +455,14 @@ def load_test_data_per_model(data_loading_funcs: dict, output_path: str, X_tok_test, X_type_test, t_idx = data_loading_funcs['test'](output_path) _, _, Y_all_test = data_loading_funcs['labels'](output_path) - triplet_data_test = TripletDataset(X_tok_test, X_type_test, labels=Y_all_test, dataset_name=data_loading_funcs['name'], train_mode=False) - + elif data_loading_funcs['name'] == 'woc': # without code tokens X_id_test, X_type_test, t_idx = data_loading_funcs['test'](output_path) _, _, Y_all_test = data_loading_funcs['labels'](output_path) - triplet_data_test = TripletDataset(X_id_test, X_type_test, labels=Y_all_test, dataset_name=data_loading_funcs['name'], train_mode=False) @@ -379,30 +471,27 @@ def load_test_data_per_model(data_loading_funcs: dict, output_path: str, X_id_test, X_tok_test, t_idx = data_loading_funcs['test'](output_path) _, _, Y_all_test = data_loading_funcs['labels'](output_path) - 
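+        # Note: with train_mode=False, TripletDataset.__getitem__ falls back to
+        # get_item_test (defined further below in this file), which yields only the anchor
+        # (data, label) pair plus two empty lists, so no positive/negative sampling
+        # is performed when loading the test set.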
triplet_data_test = TripletDataset(X_id_test, X_tok_test, labels=Y_all_test, dataset_name=data_loading_funcs['name'], train_mode=False) - + else: # Complete model X_id_test, X_tok_test, X_type_test, t_idx = data_loading_funcs['test'](output_path) _, _, Y_all_test = data_loading_funcs['labels'](output_path) - triplet_data_test = TripletDataset(X_id_test, X_tok_test, X_type_test, labels=Y_all_test, dataset_name=data_loading_funcs['name'], train_mode=False) - - logger.info(f"Loaded the test set of the {data_loading_funcs['name']} dataset in {(time()-load_data_t)/60:.2f} min") + logger.info( + f"Loaded the test set of the {data_loading_funcs['name']} dataset in {(time() - load_data_t) / 60:.2f} min") return DataLoader(triplet_data_test, batch_size=no_batches, num_workers=12, drop_last=drop_last_batch), t_idx - class TripletDataset(torch.utils.data.Dataset): def __init__(self, *in_sequences: torch.Tensor, labels: torch.Tensor, dataset_name: str, - train_mode: bool=True): + train_mode: bool = True): self.data = TensorDataset(*in_sequences) self.labels = labels self.dataset_name = dataset_name @@ -411,7 +500,7 @@ def __init__(self, *in_sequences: torch.Tensor, labels: torch.Tensor, dataset_na self.get_item_func = self.get_item_train if self.train_mode else self.get_item_test def get_item_train(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], - Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: """ It returns three tuples. Each one is a (data, label) - The first tuple is (data, label) at the given index @@ -419,9 +508,9 @@ def get_item_train(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], - The third tuple is different (data, label) from the given index """ - # Find a similar datapoint randomly + # Find a similar datapoint randomly mask = self.labels == self.labels[index] - mask[index] = False # Making sure that the similar pair is NOT the same as the given index + mask[index] = False # Making sure that the similar pair is NOT the same as the given index mask = mask.nonzero() a = mask[torch.randint(high=len(mask), size=(1,))][0] @@ -429,16 +518,16 @@ def get_item_train(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], mask = self.labels != self.labels[index] mask = mask.nonzero() b = mask[torch.randint(high=len(mask), size=(1,))][0] - + return (self.data[index], self.labels[index]), (self.data[a.item()], self.labels[a.item()]), \ (self.data[b.item()], self.labels[b.item()]) def get_item_test(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], list, list]: return (self.data[index], self.labels[index]), [], [] - + def __getitem__(self, index: int) -> Tuple[Tuple[torch.Tensor, torch.Tensor], - Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: - return self.get_item_func(index) + Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + return self.get_item_func(index) def __len__(self) -> int: return len(self.data) diff --git a/type4py/preprocess.py b/type4py/preprocess.py index 21b619e..26e5fbb 100644 --- a/type4py/preprocess.py +++ b/type4py/preprocess.py @@ -219,6 +219,12 @@ def format_df(df: pd.DataFrame) -> pd.DataFrame: def encode_all_types(df_ret: pd.DataFrame, df_params: pd.DataFrame, df_vars: pd.DataFrame, output_dir: str): + + # add filtering the null types before encoding + df_ret = df_ret.drop(df_ret[df_ret['return_type'].isnull()].index) + df_params = 
df_params.drop(df_params[df_params['arg_type'].isnull()].index) + df_vars = df_vars.drop(df_vars[df_vars['var_type'].isnull()].index) + all_types = np.concatenate((df_ret['return_type'].values, df_params['arg_type'].values, df_vars['var_type'].values), axis=0) le_all = LabelEncoder() diff --git a/type4py/vectorize.py b/type4py/vectorize.py index a33bb78..7e0a496 100644 --- a/type4py/vectorize.py +++ b/type4py/vectorize.py @@ -13,6 +13,13 @@ W2V_VEC_LENGTH = 100 +class EmdTypeError(Exception): + pass + +class EmdTypeNotFound(EmdTypeError): + def __init__(self): + super().__init__("Embedding Type not found!") + class TokenIterator: def __init__(self, param_df: pd.DataFrame, return_df: pd.DataFrame, var_df: pd.DataFrame) -> None: @@ -224,8 +231,24 @@ def process_datapoints(df, output_path, embedding_type, type, trans_func, cached if not os.path.exists(os.path.join(output_path, embedding_type + type + '_datapoints_x.npy')) or not cached_file: datapoints = df.apply(trans_func, axis=1) - datapoints_X = np.stack(datapoints.progress_apply(lambda x: x.generate_datapoint()), - axis=0) + # optimize np.stack for datapoints in batches when handling large datasets + batch_size = 1000 + num_rows = datapoints.shape[0] + + if embedding_type == "identifiers_": + emd_shape = 31 + elif embedding_type == "tokens_": + emd_shape = TOKEN_SEQ_LEN[0]*TOKEN_SEQ_LEN[1] + else: + raise EmdTypeNotFound + + datapoints_X = np.empty((num_rows, emd_shape, W2V_VEC_LENGTH)) + for i in range(0, num_rows, batch_size): + start_idx = i + end_idx = min(i + batch_size, num_rows) + batch = datapoints.iloc[start_idx:end_idx] + datapoints_X[start_idx:end_idx, :, :] = np.stack(batch.progress_apply(lambda x: x.generate_datapoint()), + axis=0) np.save(os.path.join(output_path, embedding_type + type + '_datapoints_x'), datapoints_X) return datapoints_X From 896e8192772cb2b8dabe3a38be96b9ace52c6b22 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 22 Mar 2023 14:43:26 +0100 Subject: [PATCH 02/43] add the learn_sep into the type4py pepeline --- type4py/__main__.py | 8 +- type4py/data_loaders.py | 48 ++++++ type4py/learn_split.py | 335 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 388 insertions(+), 3 deletions(-) create mode 100644 type4py/learn_split.py diff --git a/type4py/__main__.py b/type4py/__main__.py index c06718e..97a940e 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -15,7 +15,7 @@ data_loading_comb_sep = {'train': data_loaders.load_combined_train_data_split, 'valid': data_loaders.load_combined_valid_data_split, 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels_split, - 'name': 'complete_sep'} + 'name': 'complete'} data_loading_woi = {'train': data_loaders.load_combined_train_data_woi, 'valid': data_loaders.load_combined_valid_data_woi, 'test': data_loaders.load_combined_test_data_woi, 'labels': data_loaders.load_combined_labels, @@ -68,10 +68,10 @@ def learn(args): train(args.o, data_loading_comb, args.p, args.v) def learn_split(args): - from type4py.learn import train + from type4py.learn_split import train_split setup_logs_file(args.o, "learn_sep") if args.c: - train(args.o, data_loading_comb_sep, args.p, args.v) + train_split(args.o, data_loading_comb_sep, args.dt, args.p, args.v) def predict(args): from type4py.predict import test @@ -145,6 +145,8 @@ def main(): learning_parser_sep = sub_parsers.add_parser('learn_sep') learning_parser_sep.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") 
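+    # Hypothetical usage sketch (paths are placeholders): the split training is meant to be
+    # invoked once per prediction task, e.g.:
+    #   type4py learn_sep --o ./processed_projects --dt var
+    #   type4py learn_sep --o ./processed_projects --dt param
+    #   type4py learn_sep --o ./processed_projects --dt ret
+    # where --dt accepts "var", "param" or "ret" (see data_type_list in learn_split.py).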
learning_parser_sep.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") + learning_parser_sep.add_argument('--dt', '--datatype', required=True, type=str, + help="Datatype for training phase") learning_parser_sep.add_argument('--p', '--parameters', required=False, type=str, help="Path to the JSON file of model's hyper-parameters") learning_parser_sep.add_argument('--v', '--validation', default=False, action="store_true", diff --git a/type4py/data_loaders.py b/type4py/data_loaders.py index a103513..57bce64 100644 --- a/type4py/data_loaders.py +++ b/type4py/data_loaders.py @@ -442,6 +442,54 @@ def load_training_data_per_model(data_loading_funcs: dict, output_path: str, else: return train_loader, None +def load_training_data_per_model_sep(data_loading_funcs: dict, output_path: str, dataset_type: str, + no_batches: int, train_mode: bool = True, load_valid_data: bool = True, + no_workers: int = 8) -> Tuple[DataLoader, DataLoader]: + """ + Loads appropriate training data based on the model's type + """ + + load_data_t = time() + # Complete model + X_id_train, X_tok_train, X_type_train = data_loading_funcs['train'](output_path,dataset_type) + if load_valid_data: + Y_all_train, Y_all_valid, _ = data_loading_funcs['labels'](output_path,dataset_type) + else: + Y_all_train, _, _ = data_loading_funcs['labels'](output_path,dataset_type) + + train_mask = select_data(Y_all_train, MIN_DATA_POINTS) + + X_id_train = X_id_train[train_mask] + X_tok_train = X_tok_train[train_mask] + X_type_train = X_type_train[train_mask] + Y_all_train = Y_all_train[train_mask] + + triplet_data_train = TripletDataset(X_id_train, X_tok_train, X_type_train, labels=Y_all_train, + dataset_name=data_loading_funcs['name'], train_mode=train_mode) + + logger.info( + f"Loaded train set of the {data_loading_funcs['name']} dataset for {dataset_type} in {(time() - load_data_t) / 60:.2f} min") + + if load_valid_data: + X_id_valid, X_tok_valid, X_type_valid = data_loading_funcs['valid'](output_path,dataset_type) + valid_mask = select_data(Y_all_valid, MIN_DATA_POINTS) + X_id_valid = X_id_valid[valid_mask] + X_tok_valid = X_tok_valid[valid_mask] + X_type_valid = X_type_valid[valid_mask] + Y_all_valid = Y_all_valid[valid_mask] + triplet_data_valid = TripletDataset(X_id_valid, X_tok_valid, X_type_valid, labels=Y_all_valid, + dataset_name=data_loading_funcs['name'], + train_mode=train_mode) + logger.info(f"Loaded valid set of the {data_loading_funcs['name']} dataset for {dataset_type}") + + train_loader = DataLoader(triplet_data_train, batch_size=no_batches, shuffle=True, + pin_memory=True, num_workers=no_workers) + + if load_valid_data: + valid_loader = DataLoader(triplet_data_valid, batch_size=no_batches, num_workers=no_workers) + return train_loader, valid_loader + else: + return train_loader, None def load_test_data_per_model(data_loading_funcs: dict, output_path: str, no_batches: int, drop_last_batch: bool = False): diff --git a/type4py/learn_split.py b/type4py/learn_split.py new file mode 100644 index 0000000..2abbdee --- /dev/null +++ b/type4py/learn_split.py @@ -0,0 +1,335 @@ +import os + +from type4py.data_loaders import select_data, TripletDataset, load_training_data_per_model, \ + load_training_data_per_model_sep +from type4py.vectorize import AVAILABLE_TYPES_NUMBER, W2V_VEC_LENGTH +from type4py.eval import eval_type_embed +from type4py.utils import load_model_params +from type4py import logger, MIN_DATA_POINTS, KNN_TREE_SIZE +from torch.utils.data import DataLoader +from typing import 
Tuple +from collections import Counter +from multiprocessing import cpu_count +from os.path import join +from time import time +from annoy import AnnoyIndex +from tqdm import tqdm +import numpy as np +import torch.nn as nn +import torch +import pickle +import pkg_resources + +logger.name = __name__ +DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +class ModelNotFit(Exception): + pass + +class NotCompleteModel(ModelNotFit): + def __init__(self): + super().__init__("learn_split may just fit for complete model!") + +class TrainedModel(Exception): + pass + +class ModelTrainedError(TrainedModel): + def __init__(self): + super().__init__("Model has been trained for this dataset!") + +class Type4Py(nn.Module): + """ + Complete model + """ + + def __init__(self, input_size: int, hidden_size: int, aval_type_size: int, + num_layers: int, output_size: int, dropout_rate: float): + super(Type4Py, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.aval_type_size = aval_type_size + self.num_layers = num_layers + self.output_size = output_size + + self.lstm_id = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, + bidirectional=True) + self.lstm_tok = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, + bidirectional=True) + self.linear = nn.Linear(self.hidden_size * 2 * 2 + self.aval_type_size, self.output_size) + self.dropout = nn.Dropout(p=dropout_rate) + + def forward(self, x_id, x_tok, x_type): + # Using dropout on input sequences + x_id = self.dropout(x_id) + x_tok = self.dropout(x_tok) + + # Flattens LSTMs weights for data-parallelism in multi-GPUs config + self.lstm_id.flatten_parameters() + self.lstm_tok.flatten_parameters() + + x_id, _ = self.lstm_id(x_id) + x_tok, _ = self.lstm_tok(x_tok) + + # Decode the hidden state of the last time step + x_id = x_id[:, -1, :] + x_tok = x_tok[:, -1, :] + + x = torch.cat((x_id, x_tok, x_type), 1) + + x = self.linear(x) + return x + + +class TripletModel(nn.Module): + """ + A model with Triplet loss for similarity learning + """ + + def __init__(self, model: nn.Module): + super(TripletModel, self).__init__() + self.model = model + + def forward(self, a, p, n): + """ + A triplet consists of anchor, positive examples and negative examples + """ + # return self.model(*(s.to(DEVICE) for s in a)), \ + # self.model(*(s.to(DEVICE) for s in p)), \ + # self.model(*(s.to(DEVICE) for s in n)) + + return self.model(*(s for s in a)), \ + self.model(*(s for s in p)), \ + self.model(*(s for s in n)) + + +def load_model(model_type: str, model_params: dict): + """ + Load the Type4Py model with desired confings + """ + + if model_type == "complete": + return Type4Py(W2V_VEC_LENGTH, model_params['hidden_size'], AVAILABLE_TYPES_NUMBER, model_params['layers'], + model_params['output_size'], model_params['dr']).to(DEVICE) + else: + raise NotCompleteModel + + +def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim: int) -> AnnoyIndex: + """ + Creates KNNs index for given type embedding vectors + """ + + annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean') + + for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed), + desc="KNN index")): + annoy_idx.add_item(i, v) + + if valid_types_embed is not None: + for i, v in enumerate(valid_types_embed): + annoy_idx.add_item(len(train_types_embed) + i, v) + + annoy_idx.build(KNN_TREE_SIZE) + return annoy_idx + + +def train_loop_dsl(model: TripletModel, criterion, 
optimizer, train_data_loader: DataLoader, + valid_data_loader: DataLoader, learning_rate: float, epochs: int, + ubiquitous_types: str, common_types: set, model_path: str): + from type4py.predict import predict_type_embed + + for epoch in range(1, epochs + 1): + model.train() + # epoch_start_t = time() + total_loss = 0 + + for batch_i, (anchor, positive_ex, negative_ex) in enumerate(tqdm(train_data_loader, + total=len(train_data_loader), + desc=f"Epoch {epoch}")): + anchor, _ = anchor[0], anchor[1] + positive_ex, _ = positive_ex[0], positive_ex[1] + negative_ex, _ = negative_ex[0], negative_ex[1] + + optimizer.zero_grad() + anchor_embed, positive_ex_embed, negative_ex_embed = model(anchor, positive_ex, negative_ex) + loss = criterion(anchor_embed, positive_ex_embed, negative_ex_embed) + + # Backward and optimize + loss.backward() + optimizer.step() + + total_loss += loss.item() + + logger.info(f"epoch: {epoch} train loss: {total_loss}") + + if valid_data_loader is not None: + if epoch % 5 == 0: + logger.info("Evaluating on validation set") + valid_start = time() + valid_loss, valid_all_acc = compute_validation_loss_dsl(model, criterion, train_data_loader, + valid_data_loader, + predict_type_embed, ubiquitous_types, + common_types) + logger.info(f"epoch: {epoch} valid loss: {valid_loss} in {(time() - valid_start) / 60.0:.2f} min.") + # torch.save(model.module, join(model_path, f"{model.module.tw_embed_model.__class__.__name__}_{train_data_loader.dataset.dataset_name}_e{epoch}_{datetime.now().strftime('%b%d_%H-%M-%S')}.pt")) + + +def compute_validation_loss_dsl(model: TripletModel, criterion, train_valid_loader: DataLoader, + valid_data_loader: DataLoader, pred_func: callable, + ubiquitous_types: str, common_types: set) -> Tuple[float, float]: + """ + Computes validation loss for Deep Similarity Learning-based approach + """ + + valid_total_loss = 0 + with torch.no_grad(): + model.eval() + + if isinstance(model, nn.DataParallel): + main_model_forward = model.module.model + else: + main_model_forward = model.model + + computed_embed_batches_train = [] + computed_embed_labels_train = [] + computed_embed_batches_valid = [] + computed_embed_labels_valid = [] + + for batch_i, (anchor, positive_ex, negative_ex) in enumerate(tqdm(valid_data_loader, + total=len(valid_data_loader), + desc="Type Cluster - Valid set")): + positive_ex, _ = positive_ex[0], positive_ex[1] + negative_ex, _ = negative_ex[0], negative_ex[1] + + anchor_embed, positive_ex_embed, negative_ex_embed = model(anchor[0], positive_ex, negative_ex) + loss = criterion(anchor_embed, positive_ex_embed, negative_ex_embed) + valid_total_loss += loss.item() + + output_a = main_model_forward(*(s.to(DEVICE) for s in anchor[0])) + computed_embed_batches_valid.append(output_a.data.cpu().numpy()) + computed_embed_labels_valid.append(anchor[1].data.cpu().numpy()) + + return valid_total_loss, 0.0 + +def check_pickle_file(type, data_loading_funcs, output_path): + var_exist = False + param_exist = False + ret_exist = False + if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) and type != "var": + var_exist = True + logger.info(f"find existing {data_loading_funcs['name']}_common_types_var.pkl file !") + if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) and type != "param": + param_exist = True + logger.info(f"find existing {data_loading_funcs['name']}_common_types_param.pkl file !") + if os.path.exists(join(output_path, 
f"{data_loading_funcs['name']}_common_types_ret.pkl")) and type != "ret": + ret_exist = True + logger.info(f"find existing {data_loading_funcs['name']}_common_types_ret.pkl file !") + return var_exist, param_exist, ret_exist + +def find_existing_model(data_loading_funcs, output_path): + prefix = f"type4py_{data_loading_funcs['name']}_model" + suffix = ".pt" + for filename in os.listdir(output_path): + if filename.startswith(prefix) and filename.endswith(suffix): + logger.info(f"find existing model file: {filename}!") + middle = filename[len(prefix):-len(suffix)] + trained = middle.split("_") + return filename, trained + return None, None + +def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, model_params_path=None, validation: bool = False): + logger.info(f"Training Type4Py model") + logger.info(f"***********************************************************************") + + # Model's hyper parameters + model_params = load_model_params(model_params_path) + data_type_list = ["var", "param", "ret"] + if dataset_type not in data_type_list: + raise ValueError(f"{dataset_type} is not in the default data type list!") + + train_data_loader, valid_data_loader = load_training_data_per_model_sep(data_loading_funcs, output_path,dataset_type, + model_params['batches'], + load_valid_data=validation, + no_workers=cpu_count() // 2) + + # Loading label encoder and finding ubiquitous & common types + le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb')) + count_types = Counter(train_data_loader.dataset.labels.data.numpy()) + + var_exists, param_exits, ret_exists = check_pickle_file(dataset_type, data_loading_funcs, output_path) + + if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl")): + logger.warn(f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl file exists!") + + with open(join(output_path, f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl"), 'wb') as f: + pickle.dump(count_types, f) + + type_filename = dataset_type + + if var_exists and dataset_type != "var": + with open(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl"), 'rb') as f1: + count_types_var = pickle.load(f1) + count_types.update(count_types_var) + type_filename = type_filename + "_var" + + if param_exits and dataset_type != "param": + with open(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl"), 'rb') as f2: + count_types_param = pickle.load(f2) + count_types.update(count_types_param) + type_filename = type_filename + "_param" + + if ret_exists and dataset_type != "ret": + with open(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl"), 'rb') as f3: + count_types_ret = pickle.load(f3) + count_types.update(count_types_ret) + type_filename = type_filename + "_ret" + + common_types = [t.item() for t in train_data_loader.dataset.labels if count_types[t.item()] >= 100] + ubiquitous_types = set(le_all.transform(['str', 'int', 'list', 'bool', 'float'])) + common_types = set(common_types) - ubiquitous_types + + logger.info("Percentage of ubiquitous types: %.2f%%" % (len([t.item() for t in \ + train_data_loader.dataset.labels if + t.item() in ubiquitous_types]) / + train_data_loader.dataset.labels.shape[0] * 100.0)) + logger.info("Percentage of common types: %.2f%%" % (len([t.item() for t in \ + train_data_loader.dataset.labels if + t.item() in common_types]) / + train_data_loader.dataset.labels.shape[0] * 100.0)) + + with open(join(output_path, 
f"{data_loading_funcs['name']}_common_types_{type_filename}.pkl"), 'wb') as f: + pickle.dump(common_types, f) + + trained_model_name, trained_types = find_existing_model(data_loading_funcs, output_path) + + if trained_types == None: + logger.info("No trained model found, starting to intialize the model...") + # Loading the model + model = load_model(data_loading_funcs['name'], model_params) + logger.info(f"Intializing the {model.__class__.__name__} model") + model = TripletModel(model).to(DEVICE) + else: + if dataset_type in trained_types: + raise ModelTrainedError + else: + logger.info(f"Loading saved model {trained_model_name}...") + model = torch.load(join(output_path, trained_model_name)) + + if torch.cuda.device_count() > 1: + model = nn.DataParallel(model) + + criterion = torch.nn.TripletMarginLoss(margin=model_params['margin']) + optimizer = torch.optim.Adam(model.parameters(), lr=model_params['lr']) + + train_t = time() + train_loop_dsl(model, criterion, optimizer, train_data_loader, + valid_data_loader if validation else None, model_params['lr'], + model_params['epochs'], ubiquitous_types, common_types, None) + logger.info("Training finished in %.2f min" % ((time() - train_t) / 60)) + + # Saving the model + logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) + os.remove(output_path, trained_model_name) + torch.save(model.module if torch.cuda.device_count() > 1 else model, + join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From b38ba1e259bfe0fa5b709ff01f18a5e952ec9ec2 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 23 Mar 2023 10:58:39 +0100 Subject: [PATCH 03/43] add gen_cluster and update reduce for batches script --- type4py/__main__.py | 15 +- type4py/gen_cluster.py | 314 +++++++++++++++++++++++++++++++++++++++++ type4py/reduce.py | 43 +++++- type4py/to_onnx.py | 13 +- 4 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 type4py/gen_cluster.py diff --git a/type4py/__main__.py b/type4py/__main__.py index 97a940e..475a8de 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -85,6 +85,11 @@ def predict(args): elif args.c: test(args.o, data_loading_comb, args.l, args.rtc) +def gen_cluster(args): + from type4py.gen_cluster import gen_cluster + setup_logs_file(args.o, "gen_clusters") + gen_cluster(args.o, data_loading_comb_sep, args.dt) + def eval(args): from type4py.eval import evaluate setup_logs_file(args.o, "eval") @@ -100,7 +105,7 @@ def eval(args): evaluate(args.o, data_loading_comb['name'], tasks[args.t], args.tp, args.mrr) def infer(args): - from type4py.infer import infer_main + from type4py.deploy.infer import infer_main setup_logs_file(args.m, 'infer') infer_main(args.m, args.f) @@ -164,6 +169,13 @@ def main(): predict_parser.add_argument('--wov', default=False, action="store_true", help="Type4py model w/o visible type hints") predict_parser.set_defaults(func=predict) + # gen type cluster incremental: predict phase + predict_parser = sub_parsers.add_parser('gen_clu') + predict_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") + predict_parser.add_argument('--dt', '--datatype', required=True, help="Datatype for generating type clusters") + predict_parser.set_defaults(func=gen_cluster) + + # Evaluation phase eval_parser = sub_parsers.add_parser('eval') eval_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") @@ -194,6 +206,7 @@ def main(): reduce_parser = 
sub_parsers.add_parser('reduce') reduce_parser.add_argument("--o", required=True, type=str, help="Path to processed projects") reduce_parser.add_argument("--d", default=256, type=int, help="A new dimension for type clusters [Default: 256]") + reduce_parser.add_argument("--batch", default=False, action="store_true", help="Reduce type clusters in batches") reduce_parser.set_defaults(func=reduce_tc) args = arg_parser.parse_args() diff --git a/type4py/gen_cluster.py b/type4py/gen_cluster.py new file mode 100644 index 0000000..64d863b --- /dev/null +++ b/type4py/gen_cluster.py @@ -0,0 +1,314 @@ +import argparse +import os + +from type4py.learn import load_model, TripletModel + +from type4py.data_loaders import select_data, TripletDataset, load_test_data_per_model, load_training_data_per_model_sep +from type4py.deploy.infer import compute_types_score +from type4py.utils import load_model_params, setup_logs_file +from type4py import logger, MIN_DATA_POINTS, KNN_TREE_SIZE, data_loaders +from libsa4py.utils import save_json +from typing import Tuple, List +from os.path import join +from time import time +from torch.utils.data import DataLoader +from tqdm import tqdm +from annoy import AnnoyIndex +from sklearn.decomposition import PCA +import numpy as np +import pandas as pd +import pickle +import re +import torch +import torch.nn as nn + +logger.name = __name__ +DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +class ModelNotfound(Exception): + pass + +class ModelNotExistsError(ModelNotfound): + def __init__(self, model_name): + super().__init__(f"Model {model_name} not found!") + +class Type4Py(nn.Module): + """ + Complete model + """ + + def __init__(self, input_size: int, hidden_size: int, aval_type_size: int, + num_layers: int, output_size: int, dropout_rate: float): + super(Type4Py, self).__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.aval_type_size = aval_type_size + self.num_layers = num_layers + self.output_size = output_size + + self.lstm_id = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, + bidirectional=True) + self.lstm_tok = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, + bidirectional=True) + self.linear = nn.Linear(self.hidden_size * 2 * 2 + self.aval_type_size, self.output_size) + self.dropout = nn.Dropout(p=dropout_rate) + + def forward(self, x_id, x_tok, x_type): + # Using dropout on input sequences + x_id = self.dropout(x_id) + x_tok = self.dropout(x_tok) + + # Flattens LSTMs weights for data-parallelism in multi-GPUs config + self.lstm_id.flatten_parameters() + self.lstm_tok.flatten_parameters() + + x_id, _ = self.lstm_id(x_id) + x_tok, _ = self.lstm_tok(x_tok) + + # Decode the hidden state of the last time step + x_id = x_id[:, -1, :] + x_tok = x_tok[:, -1, :] + + x = torch.cat((x_id, x_tok, x_type), 1) + + x = self.linear(x) + return x + + +class TripletModel(nn.Module): + """ + A model with Triplet loss for similarity learning + """ + + def __init__(self, model: nn.Module): + super(TripletModel, self).__init__() + self.model = model + + def forward(self, a, p, n): + """ + A triplet consists of anchor, positive examples and negative examples + """ + # return self.model(*(s.to(DEVICE) for s in a)), \ + # self.model(*(s.to(DEVICE) for s in p)), \ + # self.model(*(s.to(DEVICE) for s in n)) + + return self.model(*(s for s in a)), \ + self.model(*(s for s in p)), \ + self.model(*(s for s in n)) + + +def predict_type_embed(types_embed_array: np.array, 
types_embed_labels: np.array,
+                       indexed_knn: AnnoyIndex, k: int) -> List[dict]:
+    """
+    Predict type of given type embedding vectors
+    """
+
+    pred_types_embed = []
+    pred_types_score = []
+    for i, embed_vec in enumerate(
+            tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
+        idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True)
+        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
+        pred_types_embed.append([i for i, s in pred_idx_scores])
+        pred_types_score.append(pred_idx_scores)
+
+    return pred_types_embed, pred_types_score
+
+
+def predict_type_embed_task(types_embed_array: np.array, types_embed_labels: np.array, type_space_labels: np.array,
+                            pred_task_idx: tuple, indexed_knn: AnnoyIndex, k: int) -> List[dict]:
+    def find_pred_task(i: int):
+        if i < pred_task_idx[0]:
+            return 'Parameter'
+        elif i < pred_task_idx[1]:
+            return 'Return'
+        else:
+            return 'Variable'
+
+    pred_types: List[dict] = []
+    # pred_types_embed = []
+    # pred_types_score = []
+    for i, embed_vec in enumerate(
+            tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")):
+        idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True)
+        pred_idx_scores = compute_types_score(dist, idx, type_space_labels)
+
+        pred_types.append({'original_type': types_embed_labels[i], 'predictions': pred_idx_scores,
+                           'task': find_pred_task(i),
+                           'is_parametric': bool(re.match(r'(.+)\[(.+)\]', types_embed_labels[i]))})
+
+        # pred_types_embed.append([i for i, s in pred_idx_scores])
+        # pred_types_score.append(pred_idx_scores)
+
+    return pred_types
+
+
+def build_type_clusters(model, output_path, train_data_loader: DataLoader, valid_data_loader: DataLoader, type_vocab: set,
+                        exist_index: str, exist_emd: str):
+    logger.info("Type cluster building started...")
+    computed_embed_labels = []
+    annoy_idx = AnnoyIndex(model.output_size, 'euclidean')
+    loaded_idx = AnnoyIndex(model.output_size, 'euclidean')
+    curr_idx = 0
+
+    if exist_index is not None:
+        loaded_idx.load(join(output_path, exist_index))
+        curr_idx = loaded_idx.get_n_items()
+        for i in range(loaded_idx.get_n_items()):
+            item_vector = loaded_idx.get_item_vector(i)
+            annoy_idx.add_item(i, item_vector)
+
+    if exist_emd is not None:
+        embedd_labels = np.load(join(output_path, exist_emd)).tolist()
+        computed_embed_labels.extend(embedd_labels)
+
+    for _, (a, _, _) in enumerate(
+            tqdm(train_data_loader, total=len(train_data_loader), desc="Computing Type Clusters - Train set")):
+        model.eval()
+        with torch.no_grad():
+            output_a = model(*(s.to(DEVICE) for s in a[0]))
+            labels = a[1].data.cpu().numpy()
+            # computed_embed_labels.append(labels)
+            for i, v in enumerate(output_a.data.cpu().numpy()):
+                if labels[i] in type_vocab:
+                    annoy_idx.add_item(curr_idx, v)
+                    computed_embed_labels.append(labels[i])
+                    curr_idx += 1
+
+    for _, (a, _, _) in enumerate(
+            tqdm(valid_data_loader, total=len(valid_data_loader), desc="Computing Type Clusters - Valid set")):
+        model.eval()
+        with torch.no_grad():
+            output_a = model(*(s.to(DEVICE) for s in a[0]))
+            labels = a[1].data.cpu().numpy()
+            # computed_embed_labels.append(a[1].data.cpu().numpy())
+            for i, v in enumerate(output_a.data.cpu().numpy()):
+                if labels[i] in type_vocab:
+                    annoy_idx.add_item(curr_idx, v)
+                    computed_embed_labels.append(labels[i])
+                    curr_idx += 1
+
+    annoy_idx.build(KNN_TREE_SIZE)
+    # annoy_idx. 
+    return annoy_idx, np.array(computed_embed_labels)  # np.hstack(computed_embed_labels)
+
+
+def compute_type_embed_batch(model, data_loader: DataLoader, pca: PCA = None) -> Tuple[np.array, np.array]:
+    """
+    Compute type embeddings for the whole dataset
+    """
+
+    computed_embed_batches = []
+    computed_embed_labels = []
+
+    for batch_i, (a, p, n) in enumerate(tqdm(data_loader, total=len(data_loader), desc="Computing Type Clusters")):
+        model.eval()
+        with torch.no_grad():
+            output_a = model(*(s.to(DEVICE) for s in a[0]))
+            output_a = output_a.data.cpu().numpy()
+            computed_embed_batches.append(pca.transform(output_a) if pca is not None else output_a)
+            computed_embed_labels.append(a[1].data.cpu().numpy())
+
+    return np.vstack(computed_embed_batches), np.hstack(computed_embed_labels)
+
+
+class DataTypeNotExistError(Exception):
+    pass
+
+
+def find_existing_index(data_loading_funcs, output_path):
+    prefix = f"type4py_{data_loading_funcs['name']}_type_cluster"
+    for filename in os.listdir(output_path):
+        if filename.startswith(prefix):
+            logger.info(f"Found existing TypeCluster file: {filename}")
+            middle = filename[len(prefix):]
+            trained = middle.split("_")
+            return filename, trained
+    return None, None
+
+
+def find_existing_embedding(data_loading_funcs, output_path):
+    prefix = f"type4py_{data_loading_funcs['name']}_true"
+    suffix = ".npy"
+    for filename in os.listdir(output_path):
+        if filename.startswith(prefix) and filename.endswith(suffix):
+            logger.info(f"Found existing Embedding file: {filename}")
+            middle = filename[:-len(suffix)]
+            # trained = middle.split("_")
+            return filename, middle
+    return None, None
+
+
+def gen_cluster(output_path: str, data_loading_funcs: dict, datatype: str, type_vocab_limit: int = None,
+                use_tc_reduced: bool = False):
+    logger.info("Generating type clusters with the Type4Py model")
+    logger.info("**********************************************************************")
+
+    # Model's hyper parameters
+    model_params = load_model_params()
+    if os.path.exists(join(output_path, f"type4py_{data_loading_funcs['name']}_model_var_param_ret.pt")):
+        model = torch.load(join(output_path, f"type4py_{data_loading_funcs['name']}_model_var_param_ret.pt"))
+    else:
+        raise ModelNotExistsError(f"type4py_{data_loading_funcs['name']}_model_var_param_ret.pt")
+
+    le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb'))
+    type_vocab = pd.read_csv(join(output_path, '_most_frequent_all_types.csv')).head(
+        type_vocab_limit if type_vocab_limit is not None else -1)
+    type_vocab = set(le_all.transform(type_vocab['type'].values))
+    logger.info(f"Loaded the pre-trained Type4Py {data_loading_funcs['name']} model")
+    logger.info(f"Type4Py's trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
+
+    annoy_index: AnnoyIndex = None
+    pca_transform: PCA = None
+    embed_labels: np.array = None
+
+    if not use_tc_reduced:
+
+        # checking datatype
+        if datatype not in {"var", "param", "ret"}:
+            raise DataTypeNotExistError(f"datatype input {datatype} not in [var, param, ret] list")
+
+        # checking and loading the existing Annoy_Index
+        logger.info("Checking the existing AnnoyIndex...")
+        cluster_file, processed_type = find_existing_index(data_loading_funcs, output_path)
+        if cluster_file is None:
+            logger.info("No existing AnnoyIndex found, initializing a new one")
+
+        # checking and loading the embedded labels
+        logger.info("Checking the existing Embedding labels...")
+        embedded_file, processed_type_em = find_existing_embedding(data_loading_funcs, output_path)
+        if embedded_file is None:
+            logger.info("No existing Embedding file found, initializing a new one")
+
+        # Loading dataset
+        logger.info(f"Loading train and valid sets for datatype {datatype}")
+        data_type_list = ["var", "param", "ret"]
+        if datatype not in data_type_list:
+            raise ValueError(f"{datatype} is not in the default data type list!")
+
+        train_data_loader, valid_data_loader = load_training_data_per_model_sep(data_loading_funcs, output_path,
+                                                                                datatype,
+                                                                                model_params['batches'])
+        logger.info("Train and Valid data loaded")
+
+        annoy_index, embed_labels = build_type_clusters(model.model, output_path, train_data_loader, valid_data_loader, type_vocab,
+                                                        cluster_file, embedded_file)
+        logger.info("Created type clusters")
+
+        # update and save the annoy_index and embed_labels
+        if cluster_file is not None:
+            os.remove(join(output_path, cluster_file))
+            cluster_file = cluster_file + "_" + datatype
+            annoy_index.save(join(output_path, cluster_file))
+        else:
+            annoy_index.save(join(output_path, f"type4py_{data_loading_funcs['name']}_type_cluster_{datatype}"))
+
+        if embedded_file is not None:
+            os.remove(join(output_path, embedded_file))
+            embedded_file = processed_type_em + "_" + datatype
+            np.save(join(output_path, embedded_file), embed_labels)
+        else:
+            np.save(join(output_path, f"type4py_{data_loading_funcs['name']}_true_{datatype}.npy"), embed_labels)
+
+        logger.info("Saved type clusters")
diff --git a/type4py/reduce.py b/type4py/reduce.py
index b669508..32ac682 100644
--- a/type4py/reduce.py
+++ b/type4py/reduce.py
@@ -2,11 +2,12 @@
 This script uses PCA to reduce the dimension of type clusters,
 which decreases the size of type clusters (Annoy Index).
 NOTE THAT the reduced version of type clusters causes a slight performance loss in type prediction.
""" +import os.path from type4py import logger, KNN_TREE_SIZE from type4py.utils import load_model_params from annoy import AnnoyIndex -from sklearn.decomposition import PCA +from sklearn.decomposition import PCA, IncrementalPCA from os.path import join from tqdm import tqdm import numpy as np @@ -14,10 +15,24 @@ logger.name = __name__ + +class TypeClusterNotFound(Exception): + def __init__(self): + super().__init__("Type clusters not found!") + + def reduce_tc(args): model_params = load_model_params() type_cluster_index = AnnoyIndex(model_params['output_size'], 'euclidean') - type_cluster_index.load(join(args.o, "type4py_complete_type_cluster")) + if os.path.exists(join(args.o, "type4py_complete_type_cluster")): + logger.info("Loading type clusters: type4py_complete_type_cluster") + type_cluster_index.load(join(args.o, "type4py_complete_type_cluster")) + elif os.path.exists(join(args.o, "type4py_complete_type_cluster_var_param_return")): + logger.info("Loading type clusters: type4py_complete_type_cluster_var_param_return") + type_cluster_index.load(join(args.o, "type4py_complete_type_cluster_var_param_return")) + else: + raise TypeClusterNotFound + logger.info("Loaded type clusters") type_cluster_dps = np.zeros((type_cluster_index.get_n_items(), model_params['output_size'])) @@ -25,8 +40,27 @@ def reduce_tc(args): type_cluster_dps[i] = type_cluster_index.get_item_vector(i) logger.info(f"Applying PCA to type clusters to reduce dimension from {model_params['output_size']} to {args.d}") - pca = PCA(n_components=args.d) - reduced_type_clusters = pca.fit_transform(type_cluster_dps) + + if not args.batch: + pca = PCA(n_components=args.d) + reduced_type_clusters = pca.fit_transform(type_cluster_dps) + else: + n_samples, n_features = type_cluster_dps.shape + batch_size = 1000 + + # Create an instance of IncrementalPCA + pca = IncrementalPCA(n_components=args.d, batch_size=1000) + logger.info("Starting the PCA fitting process") + # Loop over batches of data, updating the PCA with each batch + for batch_start in tqdm(range(0, n_samples, batch_size)): + batch_end = min(batch_start + batch_size, n_samples) + X_batch = type_cluster_dps[batch_start:batch_end, :] + pca.partial_fit(X_batch) + # logger.info(f"{batch_end} have been processed") + + logger.info("Starting the PCA transform process") + # Transform the data into the PCA space + reduced_type_clusters = pca.transform(type_cluster_dps) pickle.dump(pca, open(join(args.o, 'type_clusters_pca.pkl'), 'wb')) @@ -38,4 +72,3 @@ def reduce_tc(args): tc_reduced_index.build(KNN_TREE_SIZE) tc_reduced_index.save(join(args.o, 'type4py_complete_type_cluster_reduced')) logger.info("Saved the reduced type clusters on the disk") - \ No newline at end of file diff --git a/type4py/to_onnx.py b/type4py/to_onnx.py index 29e4e8f..57a23df 100644 --- a/type4py/to_onnx.py +++ b/type4py/to_onnx.py @@ -1,7 +1,7 @@ """ Converts the pre-trained Type4Py model to ONNX """ - +import os.path from os.path import join from type4py import logger from type4py.data_loaders import to_numpy @@ -17,7 +17,16 @@ def type4py_to_onnx(args): - type4py_model = torch.load(join(args.o, "type4py_complete_model.pt")).model + + if os.path.exists(join(args.o, "type4py_complete_model.pt")): + logger.info("Loading the pre-trained Type4Py model") + type4py_model = torch.load(join(args.o, "type4py_complete_model.pt")).model + elif os.path.exists(join(args.o, "type4py_complete_model_var_param_return.pt")): + logger.info("Loading the pre-trained Type4Py model") + type4py_model = torch.load(join(args.o, 
"type4py_complete_model_var_param_return.pt")).model + else: + raise FileNotFoundError("Type4Py model not found!") + type4py_model.eval() logger.info("Loaded the pre-trained Type4Py model") From bfb973253669b7c999595a8d843cf8494a423958 Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 24 Mar 2023 10:36:39 +0100 Subject: [PATCH 04/43] add infer_project CLI command for infer the dataset --- dataset_split_repo.csv | 2222 +++++++++++++++++++++++++++++++ type4py/__main__.py | 125 +- type4py/deploy/infer.py | 4 +- type4py/deploy/infer_project.py | 89 ++ 4 files changed, 2399 insertions(+), 41 deletions(-) create mode 100644 dataset_split_repo.csv create mode 100644 type4py/deploy/infer_project.py diff --git a/dataset_split_repo.csv b/dataset_split_repo.csv new file mode 100644 index 0000000..8b9a78e --- /dev/null +++ b/dataset_split_repo.csv @@ -0,0 +1,2222 @@ +set,project,json +train,data/NixOS/nixpkgs,NixOSnixpkgs.json +train,data/daxartio/tinvest,daxartiotinvest.json +train,data/rendrom/rosreestr2coord,rendromrosreestr2coord.json +train,data/darknessomi/musicbox,darknessomimusicbox.json +train,data/pydantic/pydantic-core,pydanticpydantic-core.json +train,data/pyauth/pyotp,pyauthpyotp.json +train,data/equinor/ert,equinorert.json +train,data/apache/submarine,apachesubmarine.json +train,data/nipunn1313/mypy-protobuf,nipunn1313mypy-protobuf.json +train,data/wrike/callisto,wrikecallisto.json +train,data/Gobot1234/steam.py,Gobot1234steam.py.json +train,data/cjolowicz/cookiecutter-hypermodern-python,cjolowiczcookiecutter-hypermodern-python.json +train,data/ponty/pyscreenshot,pontypyscreenshot.json +train,data/fzls/djc_helper,fzlsdjc_helper.json +train,data/hove-io/navitia,hove-ionavitia.json +train,data/facebookresearch/demucs,facebookresearchdemucs.json +train,data/davidhamann/python-fmrest,davidhamannpython-fmrest.json +train,data/pytest-dev/pytest-factoryboy,pytest-devpytest-factoryboy.json +train,data/darrikonn/td-cli,darrikonntd-cli.json +train,data/PeterDing/BaiduPCS-Py,PeterDingBaiduPCS-Py.json +train,data/ehForwarderBot/ehForwarderBot,ehForwarderBotehForwarderBot.json +train,data/aio-libs/aiohttp-debugtoolbar,aio-libsaiohttp-debugtoolbar.json +train,data/xitorch/xitorch,xitorchxitorch.json +train,data/ljvmiranda921/seagull,ljvmiranda921seagull.json +train,data/facelessuser/wcmatch,facelessuserwcmatch.json +train,data/microsoft/bistring,microsoftbistring.json +train,data/danieljfarrell/pvtrace,danieljfarrellpvtrace.json +train,data/asvetlov/us-pycon-2019-tutorial,asvetlovus-pycon-2019-tutorial.json +train,data/zulip/zulip-terminal,zulipzulip-terminal.json +train,data/hbmartin/graphviz2drawio,hbmartingraphviz2drawio.json +train,data/insitro/redun,insitroredun.json +train,data/redis/redis-om-python,redisredis-om-python.json +train,data/mjwestcott/runnel,mjwestcottrunnel.json +train,data/usc-isi-i2/cskg,usc-isi-i2cskg.json +train,data/allegro/allRank,allegroallRank.json +train,data/sourcelair/ceryx,sourcelairceryx.json +train,data/freedomofpress/securedrop,freedomofpresssecuredrop.json +train,data/kislyuk/aegea,kislyukaegea.json +train,data/ehForwarderBot/efb-telegram-master,ehForwarderBotefb-telegram-master.json +train,data/StellarCN/py-stellar-base,StellarCNpy-stellar-base.json +train,data/lorenzocesconetto/fastapi-postgresql,lorenzocesconettofastapi-postgresql.json +train,data/sudoguy/tiktokpy,sudoguytiktokpy.json +train,data/rasterio/rasterio,rasteriorasterio.json +train,data/abilian/abilian-sbe,abilianabilian-sbe.json +train,data/cdump/radiacode,cdumpradiacode.json 
+train,data/blockworks-foundation/mango-explorer,blockworks-foundationmango-explorer.json +train,data/quora/asynq,quoraasynq.json +train,data/google/agi,googleagi.json +train,data/flask-extensions/flaskextensions.com,flask-extensionsflaskextensions.com.json +train,data/provinzio/CoinTaxman,provinzioCoinTaxman.json +train,data/GitGuardian/py-gitguardian,GitGuardianpy-gitguardian.json +train,data/ets-labs/python-dependency-injector,ets-labspython-dependency-injector.json +train,data/Yelp/fuzz-lightyear,Yelpfuzz-lightyear.json +train,data/napalm-automation/napalm,napalm-automationnapalm.json +train,data/asanakoy/kaggle-lyft-motion-prediction-av,asanakoykaggle-lyft-motion-prediction-av.json +train,data/Limych/ha-car_wash,Limychha-car_wash.json +train,data/baal-org/baal,baal-orgbaal.json +train,data/zalando-incubator/kopf,zalando-incubatorkopf.json +train,data/ep1cman/unifi-protect-backup,ep1cmanunifi-protect-backup.json +train,data/metriculous-ml/metriculous,metriculous-mlmetriculous.json +train,data/OSLL/qemu-xtensa,OSLLqemu-xtensa.json +train,data/materialsvirtuallab/maml,materialsvirtuallabmaml.json +train,data/luqasz/librouteros,luqaszlibrouteros.json +train,data/internetarchive/fatcat,internetarchivefatcat.json +train,data/aio-libs/aiohttp-demos,aio-libsaiohttp-demos.json +train,data/larose/utt,laroseutt.json +train,data/jupyter/nbclient,jupyternbclient.json +train,data/volitank/nala,volitanknala.json +train,data/saadmk11/changelog-ci,saadmk11changelog-ci.json +train,data/PythonFreeCourse/lms,PythonFreeCourselms.json +train,data/jaesivsm/JARR,jaesivsmJARR.json +train,data/bain3/pronotepy,bain3pronotepy.json +train,data/alenpaulvarghese/Web-Screenshot-Bot,alenpaulvargheseWeb-Screenshot-Bot.json +train,data/rustedpy/result,rustedpyresult.json +train,data/burakbayramli/books,burakbayramlibooks.json +train,data/BenevolentAI/guacamol,BenevolentAIguacamol.json +train,data/skorokithakis/django-project-template,skorokithakisdjango-project-template.json +train,data/python-poetry/poetry-plugin-export,python-poetrypoetry-plugin-export.json +train,data/medtagger/MedTagger,medtaggerMedTagger.json +train,data/MartinThoma/algorithms,MartinThomaalgorithms.json +train,data/pattertj/LoopTrader,pattertjLoopTrader.json +train,data/kedro-org/kedro,kedro-orgkedro.json +train,data/asynkron/protoactor-python,asynkronprotoactor-python.json +train,data/5afe/safe-relay-service,5afesafe-relay-service.json +train,data/benkehoe/aws-assume-role-lib,benkehoeaws-assume-role-lib.json +train,data/custom-components/alexa_media_player,custom-componentsalexa_media_player.json +train,data/thonny/thonny,thonnythonny.json +train,data/antoniosarosi/pycritty,antoniosarosipycritty.json +train,data/strictdoc-project/strictdoc,strictdoc-projectstrictdoc.json +train,data/strozfriedberg/cobaltstrike-config-extractor,strozfriedbergcobaltstrike-config-extractor.json +train,data/wemake-services/wemake-python-package,wemake-serviceswemake-python-package.json +train,data/neo4j/graph-data-science-client,neo4jgraph-data-science-client.json +train,data/ezyang/ghstack,ezyangghstack.json +train,data/kdrag0n/pyrobud,kdrag0npyrobud.json +train,data/TotallyNotRobots/CloudBot,TotallyNotRobotsCloudBot.json +train,data/lidatong/dataclasses-json,lidatongdataclasses-json.json +train,data/omegaup/omegaup,omegaupomegaup.json +train,data/artefactory/streamlit_prophet,artefactorystreamlit_prophet.json +train,data/JDASoftwareGroup/kartothek,JDASoftwareGroupkartothek.json +train,data/pydron/ifaddr,pydronifaddr.json 
+train,data/backube/scribe,backubescribe.json +train,data/jettify/pytorch-optimizer,jettifypytorch-optimizer.json +train,data/pytorch/opacus,pytorchopacus.json +train,data/dry-python/returns,dry-pythonreturns.json +train,data/pallets-eco/cachelib,pallets-ecocachelib.json +train,data/loonghao/photoshop-python-api,loonghaophotoshop-python-api.json +train,data/idom-team/idom,idom-teamidom.json +train,data/jfhbrook/pyee,jfhbrookpyee.json +train,data/sourcery-ai/python-best-practices-cookiecutter,sourcery-aipython-best-practices-cookiecutter.json +train,data/meadowdata/meadowrun,meadowdatameadowrun.json +train,data/ActivityWatch/activitywatch,ActivityWatchactivitywatch.json +train,data/symforce-org/symforce,symforce-orgsymforce.json +train,data/oppia/oppia,oppiaoppia.json +train,data/explosion/spacy-transformers,explosionspacy-transformers.json +train,data/kubeflow/training-operator,kubeflowtraining-operator.json +train,data/explosion/wasabi,explosionwasabi.json +train,data/litl/backoff,litlbackoff.json +train,data/cognitedata/cognite-sdk-python,cognitedatacognite-sdk-python.json +train,data/Mause/duckdb_engine,Mauseduckdb_engine.json +train,data/MarketSquare/robotframework-browser,MarketSquarerobotframework-browser.json +train,data/torch-points3d/torch-points3d,torch-points3dtorch-points3d.json +train,data/dry-python/lambdas,dry-pythonlambdas.json +train,data/yuval9313/FastApi-RESTful,yuval9313FastApi-RESTful.json +train,data/GArmane/python-fastapi-hex-todo,GArmanepython-fastapi-hex-todo.json +train,data/lamenezes/simple-model,lamenezessimple-model.json +train,data/mozilla/gcp-ingestion,mozillagcp-ingestion.json +train,data/msiemens/tinydb,msiemenstinydb.json +train,data/Yelp/clusterman,Yelpclusterman.json +train,data/snorkel-team/snorkel,snorkel-teamsnorkel.json +train,data/br3ndonland/inboard,br3ndonlandinboard.json +train,data/BBVA/apicheck,BBVAapicheck.json +train,data/XRPLF/xrpl-py,XRPLFxrpl-py.json +train,data/godaddy/tartufo,godaddytartufo.json +train,data/sonatype-nexus-community/jake,sonatype-nexus-communityjake.json +train,data/supabase-community/postgrest-py,supabase-communitypostgrest-py.json +train,data/faustomorales/keras-ocr,faustomoraleskeras-ocr.json +train,data/geospace-code/georinex,geospace-codegeorinex.json +train,data/graphql-python/graphql-core,graphql-pythongraphql-core.json +train,data/wbolster/jsonlines,wbolsterjsonlines.json +train,data/Escape-Technologies/graphinder,Escape-Technologiesgraphinder.json +train,data/py-sherlock/sherlock,py-sherlocksherlock.json +train,data/aiven/karapace,aivenkarapace.json +train,data/ethereum/staking-deposit-cli,ethereumstaking-deposit-cli.json +train,data/ycm-core/lsp-examples,ycm-corelsp-examples.json +train,data/20c/vaping,20cvaping.json +train,data/mwhittaker/frankenpaxos,mwhittakerfrankenpaxos.json +train,data/RTIInternational/gobbli,RTIInternationalgobbli.json +train,data/mtgjson/mtgjson,mtgjsonmtgjson.json +train,data/pinterest/pymemcache,pinterestpymemcache.json +train,data/fonttools/fonttools,fonttoolsfonttools.json +train,data/ecies/py,eciespy.json +train,data/netdevops/hier_config,netdevopshier_config.json +train,data/ickc/pantable,ickcpantable.json +train,data/your-tools/python-cli-ui,your-toolspython-cli-ui.json +train,data/noshi91/algorithm-encyclopedia,noshi91algorithm-encyclopedia.json +train,data/dmfigol/network-programmability-stream,dmfigolnetwork-programmability-stream.json +train,data/lasa01/io_import_vmf,lasa01io_import_vmf.json +train,data/samuelcolvin/arq,samuelcolvinarq.json +train,data/maqp/tfc,maqptfc.json 
+train,data/criteo/autofaiss,criteoautofaiss.json +train,data/onyb/reobject,onybreobject.json +train,data/fat-forensics/fat-forensics,fat-forensicsfat-forensics.json +train,data/fetchai/agents-aea,fetchaiagents-aea.json +train,data/LukasMasuch/streamlit-pydantic,LukasMasuchstreamlit-pydantic.json +train,data/herrmannlab/highdicom,herrmannlabhighdicom.json +train,data/vprusso/toqito,vprussotoqito.json +train,data/PacktPublishing/Clean-Code-in-Python-Second-Edition,PacktPublishingClean-Code-in-Python-Second-Edition.json +train,data/viaprotocol/tokenlists,viaprotocoltokenlists.json +train,data/dropbox/merou,dropboxmerou.json +train,data/TRoboto/Maha,TRobotoMaha.json +train,data/Fraunhofer-AISEC/gallia,Fraunhofer-AISECgallia.json +train,data/synesthesiam/rhasspy,synesthesiamrhasspy.json +train,data/zappa/Zappa,zappaZappa.json +train,data/bashtage/linearmodels,bashtagelinearmodels.json +train,data/nigma/django-easy-pdf,nigmadjango-easy-pdf.json +train,data/index-py/index.py,index-pyindex.py.json +train,data/cjolowicz/hypermodern-python,cjolowiczhypermodern-python.json +train,data/zama-ai/concrete-ml,zama-aiconcrete-ml.json +train,data/fandsdev/django,fandsdevdjango.json +train,data/vitorfs/woid,vitorfswoid.json +train,data/WebKit/WebKit-http,WebKitWebKit-http.json +train,data/pyslackers/website,pyslackerswebsite.json +train,data/hungpham2511/toppra,hungpham2511toppra.json +train,data/kedder/ofxstatement,kedderofxstatement.json +train,data/pfnet/pysen,pfnetpysen.json +train,data/marcoeilers/nagini,marcoeilersnagini.json +train,data/druid-io/pydruid,druid-iopydruid.json +train,data/thomas-young-2013/open-box,thomas-young-2013open-box.json +train,data/secondmind-labs/trieste,secondmind-labstrieste.json +train,data/pirate/crypto-trader,piratecrypto-trader.json +train,data/ssato/python-anyconfig,ssatopython-anyconfig.json +train,data/tiangolo/pydantic-sqlalchemy,tiangolopydantic-sqlalchemy.json +train,data/DragonMinded/bemaniutils,DragonMindedbemaniutils.json +train,data/jgehrcke/github-repo-stats,jgehrckegithub-repo-stats.json +train,data/dinoperovic/django-salesman,dinoperovicdjango-salesman.json +train,data/Qiskit/qiskit-optimization,Qiskitqiskit-optimization.json +train,data/patrick-kidger/signatory,patrick-kidgersignatory.json +train,data/python-distro/distro,python-distrodistro.json +train,data/vyapp/vy,vyappvy.json +train,data/metachris/pdfx,metachrispdfx.json +train,data/hackingmaterials/matminer,hackingmaterialsmatminer.json +train,data/BurnySc2/python-sc2,BurnySc2python-sc2.json +train,data/milesmcc/shynet,milesmccshynet.json +train,data/emissary-ingress/emissary,emissary-ingressemissary.json +train,data/HallerPatrick/frosch,HallerPatrickfrosch.json +train,data/megagonlabs/bunkai,megagonlabsbunkai.json +train,data/r-bioinformatics/edgePy,r-bioinformaticsedgePy.json +train,data/joowani/colorpedia,joowanicolorpedia.json +train,data/kubeflow/pipelines,kubeflowpipelines.json +train,data/scrapli/scrapli_netconf,scrapliscrapli_netconf.json +train,data/gforcada/flake8-builtins,gforcadaflake8-builtins.json +train,data/rumble-kong-league/marketplace,rumble-kong-leaguemarketplace.json +train,data/florimondmanca/djangorestframework-api-key,florimondmancadjangorestframework-api-key.json +train,data/pfnet-research/pfhedge,pfnet-researchpfhedge.json +train,data/nix-community/pypi2nix,nix-communitypypi2nix.json +train,data/nikitastupin/clairvoyance,nikitastupinclairvoyance.json +train,data/prius/learning,priuslearning.json +train,data/sifive/freedom-e-sdk,sifivefreedom-e-sdk.json 
+train,data/jstasiak/python-zeroconf,jstasiakpython-zeroconf.json +train,data/laurentS/slowapi,laurentSslowapi.json +train,data/dmyersturnbull/tyrannosaurus,dmyersturnbulltyrannosaurus.json +train,data/staticjinja/staticjinja,staticjinjastaticjinja.json +train,data/apragacz/django-rest-registration,apragaczdjango-rest-registration.json +train,data/eirannejad/pyRevit,eirannejadpyRevit.json +train,data/QQxiaoming/quard_star_tutorial,QQxiaomingquard_star_tutorial.json +train,data/colour-science/colour-demosaicing,colour-sciencecolour-demosaicing.json +train,data/wemake-services/wemake-python-styleguide,wemake-serviceswemake-python-styleguide.json +train,data/avwx-rest/avwx-engine,avwx-restavwx-engine.json +train,data/adamcharnock/lightbus,adamcharnocklightbus.json +train,data/whtsky/pixelmatch-py,whtskypixelmatch-py.json +train,data/suned/pfun,sunedpfun.json +train,data/Boavizta/environmental-footprint-data,Boaviztaenvironmental-footprint-data.json +train,data/aporia-ai/mlnotify,aporia-aimlnotify.json +train,data/rbw/aiosnow,rbwaiosnow.json +train,data/lightkurve/lightkurve,lightkurvelightkurve.json +train,data/d5h-foss/grpc-interceptor,d5h-fossgrpc-interceptor.json +train,data/cmdmnt/commandment,cmdmntcommandment.json +train,data/instadeepai/jumanji,instadeepaijumanji.json +train,data/urbanairship/python-library,urbanairshippython-library.json +train,data/aviramha/ormsgpack,aviramhaormsgpack.json +train,data/yagays/ja-timex,yagaysja-timex.json +train,data/alisaifee/limits,alisaifeelimits.json +train,data/nlef/moonraker-telegram-bot,nlefmoonraker-telegram-bot.json +train,data/sysid/sse-starlette,sysidsse-starlette.json +train,data/Pycord-Development/pycord,Pycord-Developmentpycord.json +train,data/ubisoft/mixer,ubisoftmixer.json +train,data/goincrypto/cryptocom-exchange,goincryptocryptocom-exchange.json +train,data/amundsen-io/amundsen,amundsen-ioamundsen.json +train,data/0x727/ShuiZe_0x727,0x727ShuiZe_0x727.json +train,data/tomwojcik/starlette-context,tomwojcikstarlette-context.json +train,data/scala-ts/scala-ts,scala-tsscala-ts.json +train,data/trainindata/deploying-machine-learning-models,trainindatadeploying-machine-learning-models.json +train,data/DiljotSG/MCU-Countdown,DiljotSGMCU-Countdown.json +train,data/modin-project/modin,modin-projectmodin.json +train,data/tiangolo/uvicorn-gunicorn-starlette-docker,tiangolouvicorn-gunicorn-starlette-docker.json +train,data/amundsen-io/amundsenmetadatalibrary,amundsen-ioamundsenmetadatalibrary.json +train,data/scalabel/scalabel,scalabelscalabel.json +train,data/ponty/pyunpack,pontypyunpack.json +train,data/adamcaudill/yawast,adamcaudillyawast.json +train,data/pdreker/fritz_exporter,pdrekerfritz_exporter.json +train,data/gandersen101/spaczz,gandersen101spaczz.json +train,data/pmbarrett314/curses-menu,pmbarrett314curses-menu.json +train,data/internetarchive/openlibrary-client,internetarchiveopenlibrary-client.json +train,data/ltworf/relational,ltworfrelational.json +train,data/DerwenAI/kglab,DerwenAIkglab.json +train,data/dbrattli/aioreactive,dbrattliaioreactive.json +train,data/fnl/syntok,fnlsyntok.json +train,data/dair-iitd/openie6,dair-iitdopenie6.json +train,data/useblocks/sphinx-needs,useblockssphinx-needs.json +train,data/Abjad/abjad,Abjadabjad.json +train,data/radiac/django-yarr,radiacdjango-yarr.json +train,data/aio-libs/aiodocker,aio-libsaiodocker.json +train,data/aio-libs/aiohttp-jinja2,aio-libsaiohttp-jinja2.json +train,data/ethereum/beacon_chain,ethereumbeacon_chain.json +train,data/criteo/tf-yarn,criteotf-yarn.json 
+train,data/sschuhmann/Helium,sschuhmannHelium.json +train,data/stevearc/pypicloud,stevearcpypicloud.json +train,data/omadson/fuzzy-c-means,omadsonfuzzy-c-means.json +train,data/synesthesiam/docker-mozillatts,synesthesiamdocker-mozillatts.json +train,data/Ousret/charset_normalizer,Ousretcharset_normalizer.json +train,data/fracdiff/fracdiff,fracdifffracdiff.json +train,data/Toufool/Auto-Split,ToufoolAuto-Split.json +train,data/classy-python/ccbv,classy-pythonccbv.json +train,data/camelot-dev/excalibur,camelot-devexcalibur.json +train,data/tinkoff-ai/etna,tinkoff-aietna.json +train,data/GT4SD/gt4sd-core,GT4SDgt4sd-core.json +train,data/omry/omegaconf,omryomegaconf.json +train,data/rospogrigio/daikin_residential,rospogrigiodaikin_residential.json +train,data/jingw/pyhdfs,jingwpyhdfs.json +train,data/cole/aiosmtplib,coleaiosmtplib.json +train,data/PyBites-Open-Source/karmabot,PyBites-Open-Sourcekarmabot.json +train,data/allenai/allennlp-guide,allenaiallennlp-guide.json +train,data/lxtGH/CAE,lxtGHCAE.json +train,data/raveberry/raveberry,raveberryraveberry.json +train,data/cdgriffith/Box,cdgriffithBox.json +train,data/JustFixNYC/who-owns-what,JustFixNYCwho-owns-what.json +train,data/evereux/pycatia,evereuxpycatia.json +train,data/OliverSherouse/bls,OliverSherousebls.json +train,data/coderedcorp/wagtail-cache,coderedcorpwagtail-cache.json +train,data/flexmock/flexmock,flexmockflexmock.json +train,data/repobee/repobee,repobeerepobee.json +train,data/wkentaro/morefusion,wkentaromorefusion.json +train,data/grantjenks/python-runstats,grantjenkspython-runstats.json +train,data/ktrueda/parquet-tools,ktruedaparquet-tools.json +train,data/nabaztag2018/pynab,nabaztag2018pynab.json +train,data/cisagov/pshtt,cisagovpshtt.json +train,data/aws/aws-sam-cli,awsaws-sam-cli.json +train,data/newAM/idasen,newAMidasen.json +train,data/EmilStenstrom/conllu,EmilStenstromconllu.json +train,data/get-pytube/pytube3,get-pytubepytube3.json +train,data/ecoppen/futuresboard,ecoppenfuturesboard.json +train,data/pytest-dev/pytest-testinfra,pytest-devpytest-testinfra.json +train,data/postlund/pyatv,postlundpyatv.json +train,data/pandabuilder/pandachaika,pandabuilderpandachaika.json +train,data/ml-tooling/ml-workspace,ml-toolingml-workspace.json +train,data/coding-horror/basic-computer-games,coding-horrorbasic-computer-games.json +train,data/languitar/pass-git-helper,languitarpass-git-helper.json +train,data/ilevkivskyi/com2ann,ilevkivskyicom2ann.json +train,data/pallets/markupsafe,palletsmarkupsafe.json +train,data/palewire/savepagenow,palewiresavepagenow.json +train,data/Cartus/DCGCN,CartusDCGCN.json +train,data/click-contrib/click-didyoumean,click-contribclick-didyoumean.json +train,data/RoboJackets/robocup-software,RoboJacketsrobocup-software.json +train,data/pipebird/pipebird,pipebirdpipebird.json +train,data/CyberAgentAILab/minituna,CyberAgentAILabminituna.json +train,data/static-frame/static-frame,static-framestatic-frame.json +train,data/pablotrinidad/cride-platzi,pablotrinidadcride-platzi.json +train,data/diffqc/dqc,diffqcdqc.json +train,data/andelf/tronpy,andelftronpy.json +train,data/tracim/tracim,tracimtracim.json +train,data/tiangolo/blog-posts,tiangoloblog-posts.json +train,data/markstory/lint-review,markstorylint-review.json +train,data/kylepollina/objexplore,kylepollinaobjexplore.json +train,data/rytilahti/python-miio,rytilahtipython-miio.json +train,data/octoenergy/timeserio,octoenergytimeserio.json +train,data/henriquepgomide/caRtola,henriquepgomidecaRtola.json +train,data/SirVer/ultisnips,SirVerultisnips.json 
+train,data/RobertCraigie/prisma-client-py,RobertCraigieprisma-client-py.json +train,data/rspeer/wordfreq,rspeerwordfreq.json +train,data/vmiklos/ged2dot,vmiklosged2dot.json +train,data/rentruewang/koila,rentruewangkoila.json +train,data/lseffer/stock_screener,lsefferstock_screener.json +train,data/tsileo/microblog.pub,tsileomicroblog.pub.json +train,data/bbugyi200/funky,bbugyi200funky.json +train,data/pygobject/pycairo,pygobjectpycairo.json +train,data/scrapli/scrapli_community,scrapliscrapli_community.json +train,data/jdelic/django-dbconn-retry,jdelicdjango-dbconn-retry.json +train,data/aws/sagemaker-python-sdk,awssagemaker-python-sdk.json +train,data/mozilla/glean,mozillaglean.json +train,data/rlouf/mcx,rloufmcx.json +train,data/project-serum/validators,project-serumvalidators.json +train,data/zenml-io/zenml,zenml-iozenml.json +train,data/yk/clip_music_video,ykclip_music_video.json +train,data/home-assistant-ecosystem/home-assistant-cli,home-assistant-ecosystemhome-assistant-cli.json +train,data/Gr1N/nats-python,Gr1Nnats-python.json +train,data/graphcore/popart,graphcorepopart.json +train,data/toddbirchard/plotlydash-flask-tutorial,toddbirchardplotlydash-flask-tutorial.json +train,data/thibaudcolas/curlylint,thibaudcolascurlylint.json +train,data/camptocamp/c2cgeoportal,camptocampc2cgeoportal.json +train,data/joshmarshall/mogo,joshmarshallmogo.json +train,data/ethyca/fideslang,ethycafideslang.json +train,data/webdataset/webdataset,webdatasetwebdataset.json +train,data/nucleic/kiwi,nucleickiwi.json +train,data/zurutech/gans-from-theory-to-production,zurutechgans-from-theory-to-production.json +train,data/Starfish-develop/Starfish,Starfish-developStarfish.json +train,data/laughingman7743/PyAthenaJDBC,laughingman7743PyAthenaJDBC.json +train,data/OpenMined/PySyft,OpenMinedPySyft.json +train,data/kantek/kantek,kantekkantek.json +train,data/huggingface/datasets-server,huggingfacedatasets-server.json +train,data/typeddjango/pytest-mypy-plugins,typeddjangopytest-mypy-plugins.json +train,data/TheAlgorithms/algorithms-keeper,TheAlgorithmsalgorithms-keeper.json +train,data/amundsen-io/amundsensearchlibrary,amundsen-ioamundsensearchlibrary.json +train,data/tfeldmann/simplematch,tfeldmannsimplematch.json +train,data/bartlomiejduda/Tools,bartlomiejdudaTools.json +train,data/snowflakedb/SnowAlert,snowflakedbSnowAlert.json +train,data/stac-utils/pystac-client,stac-utilspystac-client.json +train,data/CVI-SZU/CLIMS,CVI-SZUCLIMS.json +train,data/CenterForOpenScience/waterbutler,CenterForOpenSciencewaterbutler.json +train,data/sajib1066/event-calendar,sajib1066event-calendar.json +train,data/sally20921/ConSSL,sally20921ConSSL.json +train,data/mnot/thor,mnotthor.json +train,data/microsoft/planetary-computer-sdk-for-python,microsoftplanetary-computer-sdk-for-python.json +train,data/TTWShell/hobbit-core,TTWShellhobbit-core.json +train,data/apache/flink-ml,apacheflink-ml.json +train,data/jupyter-lsp/jupyterlab-lsp,jupyter-lspjupyterlab-lsp.json +train,data/raiden-network/raiden,raiden-networkraiden.json +train,data/sematic-ai/sematic,sematic-aisematic.json +train,data/open-telemetry/opentelemetry-python,open-telemetryopentelemetry-python.json +train,data/fillipe-gsm/python-tsp,fillipe-gsmpython-tsp.json +train,data/barseghyanartur/tld,barseghyanarturtld.json +train,data/pyodide/pyodide,pyodidepyodide.json +train,data/TezRomacH/layer-to-layer-pytorch,TezRomacHlayer-to-layer-pytorch.json +train,data/dosisod/refurb,dosisodrefurb.json +train,data/darrenburns/dunk,darrenburnsdunk.json 
+train,data/laixintao/myrc,laixintaomyrc.json +train,data/trailofbits/PrivacyRaven,trailofbitsPrivacyRaven.json +train,data/ambv/flake8-mypy,ambvflake8-mypy.json +train,data/hwCloudDBSDDS/dds,hwCloudDBSDDSdds.json +train,data/nocproject/noc,nocprojectnoc.json +train,data/hazelcast/hazelcast-python-client,hazelcasthazelcast-python-client.json +train,data/fugue-project/fugue,fugue-projectfugue.json +train,data/Lightning-AI/metrics,Lightning-AImetrics.json +train,data/Dineshkarthik/telegram_media_downloader,Dineshkarthiktelegram_media_downloader.json +train,data/dbeley/rymscraper,dbeleyrymscraper.json +train,data/ClearcodeHQ/pytest-postgresql,ClearcodeHQpytest-postgresql.json +train,data/Quansight-Labs/uarray,Quansight-Labsuarray.json +train,data/materialsproject/custodian,materialsprojectcustodian.json +train,data/EDCD/EDMarketConnector,EDCDEDMarketConnector.json +train,data/facebookresearch/fairscale,facebookresearchfairscale.json +train,data/improbable-research/keanu,improbable-researchkeanu.json +train,data/ongteckwu/Resume-Rater,ongteckwuResume-Rater.json +train,data/Pylons/pyramid_openapi3,Pylonspyramid_openapi3.json +train,data/MolecularAI/aizynthfinder,MolecularAIaizynthfinder.json +train,data/BoboTiG/ebook-reader-dict,BoboTiGebook-reader-dict.json +train,data/PyCQA/prospector,PyCQAprospector.json +train,data/michaelhly/solana-py,michaelhlysolana-py.json +train,data/frictionlessdata/framework,frictionlessdataframework.json +train,data/aio-libs/aiomonitor,aio-libsaiomonitor.json +train,data/airbytehq/airbyte,airbytehqairbyte.json +train,data/OpenMined/PyDP,OpenMinedPyDP.json +train,data/SekouD/mlconjug,SekouDmlconjug.json +train,data/kensho-technologies/graphql-compiler,kensho-technologiesgraphql-compiler.json +train,data/alasdairtran/fourierflow,alasdairtranfourierflow.json +train,data/pgorecki/python-ddd,pgoreckipython-ddd.json +train,data/noirello/bonsai,noirellobonsai.json +train,data/MDAnalysis/mdanalysis,MDAnalysismdanalysis.json +train,data/ReactiveX/RxPY,ReactiveXRxPY.json +train,data/splintered-reality/py_trees,splintered-realitypy_trees.json +train,data/datacamp/viewflow,datacampviewflow.json +train,data/grantjenks/python-sortedcollections,grantjenkspython-sortedcollections.json +train,data/frenck/python-wled,frenckpython-wled.json +train,data/mozillazg/python-pinyin,mozillazgpython-pinyin.json +train,data/studio-ousia/luke,studio-ousialuke.json +train,data/amidaware/tacticalrmm,amidawaretacticalrmm.json +train,data/rigetti/pyquil,rigettipyquil.json +train,data/catmaid/CATMAID,catmaidCATMAID.json +train,data/GitGuardian/ggshield,GitGuardianggshield.json +train,data/jetbridge/sls-flask,jetbridgesls-flask.json +train,data/baxtree/subaligner,baxtreesubaligner.json +train,data/chrieke/prettymapp,chriekeprettymapp.json +train,data/opensearch-project/opensearch-build,opensearch-projectopensearch-build.json +train,data/altair-viz/altair_saver,altair-vizaltair_saver.json +train,data/erinxocon/requests-xml,erinxoconrequests-xml.json +train,data/facebookresearch/CodeGen,facebookresearchCodeGen.json +train,data/altair-viz/altair_viewer,altair-vizaltair_viewer.json +train,data/vizzuhq/ipyvizzu,vizzuhqipyvizzu.json +train,data/antonagestam/collectfast,antonagestamcollectfast.json +train,data/graphql-python/graphql-relay-py,graphql-pythongraphql-relay-py.json +train,data/instaloader/instaloader,instaloaderinstaloader.json +train,data/awslabs/sockeye,awslabssockeye.json +train,data/quantmind/ccy,quantmindccy.json +train,data/zephyrproject-rtos/zephyr,zephyrproject-rtoszephyr.json 
+train,data/microsoft/lisa,microsoftlisa.json +train,data/spacegraphcats/spacegraphcats,spacegraphcatsspacegraphcats.json +train,data/smacke/ffsubsync,smackeffsubsync.json +train,data/curly60e/pyblock,curly60epyblock.json +train,data/Saleor-Multi-Vendor/saleor-multi-vendor,Saleor-Multi-Vendorsaleor-multi-vendor.json +train,data/gawel/aiocron,gawelaiocron.json +train,data/rcarriga/vim-ultest,rcarrigavim-ultest.json +train,data/Parsl/parsl,Parslparsl.json +train,data/AlexIoannides/py-package-template,AlexIoannidespy-package-template.json +train,data/mjx-project/mjx,mjx-projectmjx.json +train,data/PrairieLearn/PrairieLearn,PrairieLearnPrairieLearn.json +train,data/IntelAI/nauta,IntelAInauta.json +train,data/dccuchile/wefe,dccuchilewefe.json +train,data/bdd100k/bdd100k,bdd100kbdd100k.json +train,data/MISP/PyMISP,MISPPyMISP.json +train,data/zeromq/pyzmq,zeromqpyzmq.json +train,data/D4-project/IPASN-History,D4-projectIPASN-History.json +train,data/ActivityWatch/aw-server,ActivityWatchaw-server.json +train,data/torchvideo/torchvideo,torchvideotorchvideo.json +train,data/Knowledge-Graph-Hub/kg-covid-19,Knowledge-Graph-Hubkg-covid-19.json +train,data/bcmyers/argonautica,bcmyersargonautica.json +train,data/common-workflow-language/cwltool,common-workflow-languagecwltool.json +train,data/NAFTeam/NAFF,NAFTeamNAFF.json +train,data/florimondmanca/asgi-lifespan,florimondmancaasgi-lifespan.json +train,data/gtalarico/pyairtable,gtalaricopyairtable.json +train,data/Shougo/denite.nvim,Shougodenite.nvim.json +train,data/zbrookle/dataframe_sql,zbrookledataframe_sql.json +train,data/tetienne/somfy-open-api,tetiennesomfy-open-api.json +train,data/IBM/aihwkit,IBMaihwkit.json +train,data/oleksis/youtube-dl-gui,oleksisyoutube-dl-gui.json +train,data/ijl/orjson,ijlorjson.json +train,data/mocobeta/janome,mocobetajanome.json +train,data/tombulled/innertube,tombulledinnertube.json +train,data/music-assistant/hass-music-assistant,music-assistanthass-music-assistant.json +train,data/jazzband/django-eav2,jazzbanddjango-eav2.json +train,data/PacktPublishing/Building-Data-Science-Applications-with-FastAPI,PacktPublishingBuilding-Data-Science-Applications-with-FastAPI.json +train,data/scrapli/nornir_scrapli,scraplinornir_scrapli.json +train,data/carefree0910/carefree-learn,carefree0910carefree-learn.json +train,data/flux-framework/flux-core,flux-frameworkflux-core.json +train,data/hackobi/AI-Scalpel-Trading-Bot,hackobiAI-Scalpel-Trading-Bot.json +train,data/evinism/mistql,evinismmistql.json +train,data/cuter-testing/cuter,cuter-testingcuter.json +train,data/django-waffle/django-waffle,django-waffledjango-waffle.json +train,data/connelldave/botocove,connelldavebotocove.json +train,data/tokusumi/fastapi-cloudauth,tokusumifastapi-cloudauth.json +train,data/Solvik/netbox-agent,Solviknetbox-agent.json +train,data/flopp/py-staticmaps,flopppy-staticmaps.json +train,data/dbt-labs/dbt-core,dbt-labsdbt-core.json +train,data/run-x/opta,run-xopta.json +train,data/strawberry-graphql/strawberry,strawberry-graphqlstrawberry.json +train,data/Yakifo/amqtt,Yakifoamqtt.json +train,data/marcusbuffett/command-line-chess,marcusbuffettcommand-line-chess.json +train,data/kigawas/fastapi-django,kigawasfastapi-django.json +train,data/rodrigobressan/entity_embeddings_categorical,rodrigobressanentity_embeddings_categorical.json +train,data/sonic182/aiosonic,sonic182aiosonic.json +train,data/vcs-python/vcspull,vcs-pythonvcspull.json +train,data/lagerfeuer/cryptocompare,lagerfeuercryptocompare.json 
+train,data/amundsen-io/amundsendatabuilder,amundsen-ioamundsendatabuilder.json +train,data/lyft/awspricing,lyftawspricing.json +train,data/aio-libs/aiohttp-devtools,aio-libsaiohttp-devtools.json +train,data/sbrunner/scan-to-paperless,sbrunnerscan-to-paperless.json +train,data/vas3k/infomate.club,vas3kinfomate.club.json +train,data/decompme/decomp.me,decompmedecomp.me.json +train,data/escaped/django-video-encoding,escapeddjango-video-encoding.json +train,data/rusty-celery/rusty-celery,rusty-celeryrusty-celery.json +train,data/mbarkhau/bumpver,mbarkhaubumpver.json +train,data/Lightning-AI/lightning,Lightning-AIlightning.json +train,data/andreax79/airflow-code-editor,andreax79airflow-code-editor.json +train,data/georgebv/pyextremes,georgebvpyextremes.json +train,data/ccie18643/PyTCP,ccie18643PyTCP.json +train,data/alex-grover/hexagonal-architecture-python,alex-groverhexagonal-architecture-python.json +train,data/Shougo/defx.nvim,Shougodefx.nvim.json +train,data/internetarchive/openlibrary,internetarchiveopenlibrary.json +train,data/Instagram/MonkeyType,InstagramMonkeyType.json +train,data/taikiinoue45/STAD,taikiinoue45STAD.json +train,data/BenevolentAI/MolBERT,BenevolentAIMolBERT.json +train,data/jamesturk/scrapelib,jamesturkscrapelib.json +train,data/allenai/allenact,allenaiallenact.json +train,data/cowrie/cowrie,cowriecowrie.json +train,data/onecommons/unfurl,onecommonsunfurl.json +train,data/sklarsa/django-sendgrid-v5,sklarsadjango-sendgrid-v5.json +train,data/hukkin/cosmospy,hukkincosmospy.json +train,data/ARBML/klaam,ARBMLklaam.json +train,data/ClickHouse/dbt-clickhouse,ClickHousedbt-clickhouse.json +train,data/Opentrons/opentrons,Opentronsopentrons.json +train,data/btclib-org/btclib,btclib-orgbtclib.json +train,data/cloudtools/troposphere,cloudtoolstroposphere.json +train,data/google/cloud-forensics-utils,googlecloud-forensics-utils.json +train,data/cedricbonhomme/newspipe,cedricbonhommenewspipe.json +train,data/robinhood/faust,robinhoodfaust.json +train,data/Abdenasser/dr_scaffold,Abdenasserdr_scaffold.json +train,data/nix-community/poetry2nix,nix-communitypoetry2nix.json +train,data/onnx/onnx,onnxonnx.json +train,data/camptocamp/tilecloud,camptocamptilecloud.json +train,data/nucypher/nucypher,nucyphernucypher.json +train,data/Limych/ha-gismeteo,Limychha-gismeteo.json +train,data/dmerejkowsky/pycp,dmerejkowskypycp.json +train,data/timothycrosley/portray,timothycrosleyportray.json +train,data/Ultimaker/Cura,UltimakerCura.json +train,data/bitranox/wrapt_timeout_decorator,bitranoxwrapt_timeout_decorator.json +train,data/ossf/fuzz-introspector,ossffuzz-introspector.json +train,data/fugue-project/tutorials,fugue-projecttutorials.json +train,data/gjcarneiro/yacron,gjcarneiroyacron.json +train,data/algolia/algoliasearch-client-python,algoliaalgoliasearch-client-python.json +train,data/codalab/codalab-worksheets,codalabcodalab-worksheets.json +train,data/numberoverzero/bottom,numberoverzerobottom.json +train,data/plasma-umass/scalene,plasma-umassscalene.json +train,data/wemake-services/wemake-django-template,wemake-serviceswemake-django-template.json +train,data/openstack/nova,openstacknova.json +train,data/kkirsche/ansible-generator,kkirscheansible-generator.json +train,data/tfeldmann/organize,tfeldmannorganize.json +train,data/pzmarzly/ancs4linux,pzmarzlyancs4linux.json +train,data/model-bakers/model_bakery,model-bakersmodel_bakery.json +train,data/DataBiosphere/toil,DataBiospheretoil.json +train,data/pystatgen/sgkit,pystatgensgkit.json 
+train,data/yunojuno/elasticsearch-django,yunojunoelasticsearch-django.json +train,data/messari/messari-python-api,messarimessari-python-api.json +train,data/comtravo/ctparse,comtravoctparse.json +train,data/qctrl/python-open-controls,qctrlpython-open-controls.json +train,data/deepchem/jaxchem,deepchemjaxchem.json +train,data/jacksmith15/poetry-workspace-plugin,jacksmith15poetry-workspace-plugin.json +train,data/freqtrade/freqtrade,freqtradefreqtrade.json +train,data/samuelcolvin/python-devtools,samuelcolvinpython-devtools.json +train,data/openstack/requirements,openstackrequirements.json +train,data/internetarchive/fatcat-scholar,internetarchivefatcat-scholar.json +train,data/ClusterLabs/pcs,ClusterLabspcs.json +train,data/optuna/optuna-dashboard,optunaoptuna-dashboard.json +train,data/thorgate/django-project-template,thorgatedjango-project-template.json +train,data/lovesegfault/beautysh,lovesegfaultbeautysh.json +train,data/crossbario/crossbar,crossbariocrossbar.json +train,data/DerwenAI/pytextrank,DerwenAIpytextrank.json +train,data/hackersandslackers/paramiko-tutorial,hackersandslackersparamiko-tutorial.json +train,data/bitcoin-core/HWI,bitcoin-coreHWI.json +train,data/Yelp/Tron,YelpTron.json +train,data/networktocode/diffsync,networktocodediffsync.json +train,data/getsentry/snuba,getsentrysnuba.json +train,data/ex0dus-0x/fuzzable,ex0dus-0xfuzzable.json +train,data/Scille/parsec-cloud,Scilleparsec-cloud.json +train,data/Yelp/swagger_spec_validator,Yelpswagger_spec_validator.json +train,data/ymyzk/mypy-playground,ymyzkmypy-playground.json +train,data/cruft/cruft,cruftcruft.json +train,data/tartiflette/tartiflette-asgi,tartiflettetartiflette-asgi.json +train,data/abersheeran/rpc.py,abersheeranrpc.py.json +train,data/PyCQA/astroid,PyCQAastroid.json +train,data/EconForge/dolo.py,EconForgedolo.py.json +train,data/Bears-R-Us/arkouda,Bears-R-Usarkouda.json +train,data/tiangolo/python-machine-learning-docker,tiangolopython-machine-learning-docker.json +train,data/rthalley/dnspython,rthalleydnspython.json +train,data/soaxelbrooke/python-bpe,soaxelbrookepython-bpe.json +train,data/tiangolo/uvicorn-gunicorn-docker,tiangolouvicorn-gunicorn-docker.json +train,data/Derfirm/pornhub-api,Derfirmpornhub-api.json +train,data/pynamodb/PynamoDB,pynamodbPynamoDB.json +train,data/microsoft/onefuzz,microsoftonefuzz.json +train,data/oughtinc/ergo,oughtincergo.json +train,data/looker-open-source/sdk-codegen,looker-open-sourcesdk-codegen.json +train,data/omnilib/ufmt,omnilibufmt.json +train,data/cylc/cylc-flow,cylccylc-flow.json +train,data/druids/django-GDPR,druidsdjango-GDPR.json +train,data/tokern/piicatcher,tokernpiicatcher.json +train,data/olist/correios,olistcorreios.json +train,data/scipy/scipy,scipyscipy.json +train,data/wesleybowman/UTide,wesleybowmanUTide.json +train,data/rospogrigio/localtuya,rospogrigiolocaltuya.json +train,data/apache/arrow-ballista,apachearrow-ballista.json +train,data/SwagLyrics/SwagLyrics-For-Spotify,SwagLyricsSwagLyrics-For-Spotify.json +train,data/mar10/wsgidav,mar10wsgidav.json +train,data/DonJayamanne/vscode-python-manager,DonJayamannevscode-python-manager.json +train,data/nabla-c0d3/trust_stores_observatory,nabla-c0d3trust_stores_observatory.json +train,data/koxudaxi/fastapi-code-generator,koxudaxifastapi-code-generator.json +train,data/into-ai/deeplearning2020,into-aideeplearning2020.json +train,data/python-microservices/pyms,python-microservicespyms.json +train,data/RDFLib/pySHACL,RDFLibpySHACL.json +train,data/palewire/news-homepages,palewirenews-homepages.json 
+train,data/c-w/gutenberg-http,c-wgutenberg-http.json +train,data/Textualize/textual,Textualizetextual.json +train,data/OpenAgricultureFoundation/openag-device-software,OpenAgricultureFoundationopenag-device-software.json +train,data/alerta/alerta-contrib,alertaalerta-contrib.json +train,data/lona-web-org/lona,lona-web-orglona.json +train,data/vkbottle/vkbottle,vkbottlevkbottle.json +train,data/TrungNguyen1909/qemu-t8030,TrungNguyen1909qemu-t8030.json +train,data/Netflix-Skunkworks/diffy,Netflix-Skunkworksdiffy.json +train,data/ClearcodeHQ/mirakuru,ClearcodeHQmirakuru.json +train,data/cosmicpython/code,cosmicpythoncode.json +train,data/collerek/ormar,collerekormar.json +train,data/aio-libs/aiomcache,aio-libsaiomcache.json +train,data/snapcore/snapcraft,snapcoresnapcraft.json +train,data/vizzuhq/ipyvizzu-story,vizzuhqipyvizzu-story.json +train,data/epam/Indigo,epamIndigo.json +train,data/datadvance/DjangoChannelsGraphqlWs,datadvanceDjangoChannelsGraphqlWs.json +train,data/studio-ousia/mojimoji,studio-ousiamojimoji.json +train,data/adafruit/circuitpython,adafruitcircuitpython.json +train,data/Chavithra/degiro-connector,Chavithradegiro-connector.json +train,data/kalaspuff/tomodachi,kalaspufftomodachi.json +train,data/PyCQA/isort,PyCQAisort.json +train,data/starlite-api/starlite,starlite-apistarlite.json +train,data/Akuli/porcupine,Akuliporcupine.json +train,data/awslabs/aws-orbit-workbench,awslabsaws-orbit-workbench.json +train,data/tokern/data-lineage,tokerndata-lineage.json +train,data/dsuch/pymqi,dsuchpymqi.json +train,data/FYWinds/takker,FYWindstakker.json +train,data/nrfconnect/sdk-nrf,nrfconnectsdk-nrf.json +train,data/qt/qtbase,qtqtbase.json +train,data/python-gino/gino,python-ginogino.json +train,data/darcamo/pyphysim,darcamopyphysim.json +train,data/usc-isi-i2/kgtk,usc-isi-i2kgtk.json +train,data/absent1706/sqlalchemy-mixins,absent1706sqlalchemy-mixins.json +train,data/psf/requests-html,psfrequests-html.json +train,data/pengzhiliang/MAE-pytorch,pengzhiliangMAE-pytorch.json +train,data/eugeneyan/testing-ml,eugeneyantesting-ml.json +train,data/common-workflow-language/schema_salad,common-workflow-languageschema_salad.json +train,data/MartinThoma/banana-gym,MartinThomabanana-gym.json +train,data/twocucao/YaDjangoBlog,twocucaoYaDjangoBlog.json +train,data/duneanalytics/spellbook,duneanalyticsspellbook.json +train,data/dcos/dcos,dcosdcos.json +train,data/whylabs/whylogs,whylabswhylogs.json +train,data/tomassosorio/OCR_tablenet,tomassosorioOCR_tablenet.json +train,data/cenkalti/kuyruk,cenkaltikuyruk.json +train,data/microCOVID/microCOVID,microCOVIDmicroCOVID.json +train,data/simon-weber/gpsoauth,simon-webergpsoauth.json +train,data/unionai-oss/pandera,unionai-osspandera.json +train,data/irrdnet/irrd,irrdnetirrd.json +train,data/freelawproject/courtlistener,freelawprojectcourtlistener.json +train,data/Googolxx/STF,GoogolxxSTF.json +train,data/siddhantgoel/tornado-sqlalchemy,siddhantgoeltornado-sqlalchemy.json +train,data/tiangolo/full-stack-fastapi-couchbase,tiangolofull-stack-fastapi-couchbase.json +train,data/whitphx/streamlit-webrtc,whitphxstreamlit-webrtc.json +train,data/gforcada/flake8-isort,gforcadaflake8-isort.json +train,data/DonJayamanne/pythonVSCode,DonJayamannepythonVSCode.json +train,data/geospace-code/pymap3d,geospace-codepymap3d.json +train,data/karma0/nombot,karma0nombot.json +train,data/microsoft/playwright-python,microsoftplaywright-python.json +train,data/fscdev/vkwave,fscdevvkwave.json +train,data/EconForge/interpolation.py,EconForgeinterpolation.py.json 
+train,data/tomaae/homeassistant-mikrotik_router,tomaaehomeassistant-mikrotik_router.json +train,data/albermax/innvestigate,albermaxinnvestigate.json +train,data/keshavsingh4522/hacktoberfest2021,keshavsingh4522hacktoberfest2021.json +train,data/onicagroup/runway,onicagrouprunway.json +train,data/nrfconnect/sdk-zephyr,nrfconnectsdk-zephyr.json +train,data/etiennedub/pyk4a,etiennedubpyk4a.json +train,data/bear/parsedatetime,bearparsedatetime.json +train,data/custom-components/pyscript,custom-componentspyscript.json +train,data/anovos/anovos,anovosanovos.json +train,data/pypa/trove-classifiers,pypatrove-classifiers.json +train,data/callahantiff/OMOP2OBO,callahantiffOMOP2OBO.json +train,data/encode/httpx,encodehttpx.json +train,data/mys-lang/mys,mys-langmys.json +train,data/art049/odmantic,art049odmantic.json +train,data/gridsync/gridsync,gridsyncgridsync.json +train,data/seanwu1105/vscode-qt-for-python,seanwu1105vscode-qt-for-python.json +train,data/polyaxon/traceml,polyaxontraceml.json +train,data/JnyJny/busylight,JnyJnybusylight.json +train,data/danger/python,dangerpython.json +train,data/pyvista/pyvista,pyvistapyvista.json +train,data/kroo/wyzecam,kroowyzecam.json +train,data/danihodovic/celery-exporter,danihodoviccelery-exporter.json +train,data/oughtinc/ice,oughtincice.json +train,data/synesthesiam/voice2json,synesthesiamvoice2json.json +train,data/bodywork-ml/bodywork-core,bodywork-mlbodywork-core.json +train,data/danielgtaylor/python-betterproto,danielgtaylorpython-betterproto.json +train,data/argoproj-labs/hera-workflows,argoproj-labshera-workflows.json +train,data/doloopwhile/pyjq,doloopwhilepyjq.json +train,data/ankitects/anki,ankitectsanki.json +train,data/Qiskit/qiskit-ibmq-provider,Qiskitqiskit-ibmq-provider.json +train,data/afroisalreadyinu/miniboss,afroisalreadyinuminiboss.json +train,data/uploadcare/pyuploadcare,uploadcarepyuploadcare.json +train,data/polyaxon/datatile,polyaxondatatile.json +train,data/schireson/pytest-alembic,schiresonpytest-alembic.json +train,data/scikit-learn-contrib/MAPIE,scikit-learn-contribMAPIE.json +train,data/trallnag/prometheus-fastapi-instrumentator,trallnagprometheus-fastapi-instrumentator.json +train,data/idea-fasoc/OpenFASOC,idea-fasocOpenFASOC.json +train,data/realpython/pytest-mypy,realpythonpytest-mypy.json +train,data/pytest-dev/pytest-bdd,pytest-devpytest-bdd.json +train,data/ML-KULeuven/soccer_xg,ML-KULeuvensoccer_xg.json +train,data/nabla-c0d3/sslyze,nabla-c0d3sslyze.json +train,data/hacf-fr/renault-api,hacf-frrenault-api.json +train,data/jaywink/socialhome,jaywinksocialhome.json +train,data/piccolo-orm/piccolo_admin,piccolo-ormpiccolo_admin.json +train,data/tmbo/questionary,tmboquestionary.json +train,data/clarkperkins/click-shell,clarkperkinsclick-shell.json +train,data/tobymao/sqlglot,tobymaosqlglot.json +train,data/glutanimate/image-occlusion-enhanced,glutanimateimage-occlusion-enhanced.json +train,data/Alcibiades-Capital/fasteth,Alcibiades-Capitalfasteth.json +train,data/kserve/kserve,kservekserve.json +train,data/fsspec/kerchunk,fsspeckerchunk.json +train,data/allenai/vampire,allenaivampire.json +train,data/drivendataorg/cloudpathlib,drivendataorgcloudpathlib.json +train,data/Kartones/flask-calendar,Kartonesflask-calendar.json +train,data/lgpage/nbtutor,lgpagenbtutor.json +train,data/justinmayhew/greenstalk,justinmayhewgreenstalk.json +train,data/shirayu/whispering,shirayuwhispering.json +train,data/micro-nova/AmpliPi,micro-novaAmpliPi.json +train,data/rastrea2r/rastrea2r,rastrea2rrastrea2r.json 
+train,data/web-platform-tests/wpt,web-platform-testswpt.json +train,data/sberbank-ai-lab/AutoMLWhitebox,sberbank-ai-labAutoMLWhitebox.json +train,data/roycoding/slots,roycodingslots.json +train,data/Mbed-TLS/mbedtls,Mbed-TLSmbedtls.json +train,data/openstack/cinder,openstackcinder.json +train,data/ycd/manage-fastapi,ycdmanage-fastapi.json +train,data/Arelle/Arelle,ArelleArelle.json +train,data/openvisionapi/ova-server,openvisionapiova-server.json +train,data/adap/flower,adapflower.json +train,data/ch-sa/labelCloud,ch-salabelCloud.json +train,data/aminalaee/mongox,aminalaeemongox.json +train,data/MycroftAI/mimic3,MycroftAImimic3.json +train,data/sberbank-ai-lab/LightAutoML,sberbank-ai-labLightAutoML.json +train,data/aio-libs/aiorwlock,aio-libsaiorwlock.json +train,data/google/sre_yield,googlesre_yield.json +train,data/huan/node-facenet,huannode-facenet.json +train,data/M0r13n/pyais,M0r13npyais.json +train,data/Doist/python-timezones,Doistpython-timezones.json +train,data/zanellia/prometeo,zanelliaprometeo.json +train,data/datawire/kubernaut,datawirekubernaut.json +train,data/browniebroke/django-codemod,browniebrokedjango-codemod.json +train,data/scivision/PyLivestream,scivisionPyLivestream.json +train,data/ConsenSys/mythril,ConsenSysmythril.json +train,data/abhinavsingh/proxy.py,abhinavsinghproxy.py.json +train,data/bneijt/autotrash,bneijtautotrash.json +train,data/tiangolo/uvicorn-gunicorn-fastapi-docker,tiangolouvicorn-gunicorn-fastapi-docker.json +train,data/guedesfelipe/pls-cli,guedesfelipepls-cli.json +train,data/full-stack-deep-learning/fsdl-text-recognizer-2021-labs,full-stack-deep-learningfsdl-text-recognizer-2021-labs.json +train,data/josephbestjames/airtable.py,josephbestjamesairtable.py.json +train,data/jcreinhold/intensity-normalization,jcreinholdintensity-normalization.json +train,data/healeycodes/andoma,healeycodesandoma.json +train,data/tiangolo/uwsgi-nginx-flask-docker,tiangolouwsgi-nginx-flask-docker.json +train,data/rednafi/fastapi-nano,rednafifastapi-nano.json +train,data/quintoandar/butterfree,quintoandarbutterfree.json +train,data/RansomLook/RansomLook,RansomLookRansomLook.json +train,data/Paebbels/pyVHDLParser,PaebbelspyVHDLParser.json +train,data/allenai/allennlp-models,allenaiallennlp-models.json +train,data/bellshade/Python,bellshadePython.json +train,data/PKSHATechnology-Research/camphr,PKSHATechnology-Researchcamphr.json +train,data/OpenBB-finance/OpenBBTerminal,OpenBB-financeOpenBBTerminal.json +train,data/aio-libs/aiocache,aio-libsaiocache.json +train,data/py-why/dowhy,py-whydowhy.json +train,data/martinResearch/DEODR,martinResearchDEODR.json +train,data/mblayman/homeschool,mblaymanhomeschool.json +train,data/moralmunky/Home-Assistant-Mail-And-Packages,moralmunkyHome-Assistant-Mail-And-Packages.json +train,data/zulip/zulip,zulipzulip.json +train,data/lk-geimfari/mimesis,lk-geimfarimimesis.json +train,data/quodlibet/mutagen,quodlibetmutagen.json +train,data/sdispater/tomlkit,sdispatertomlkit.json +train,data/twocucao/silverhand,twocucaosilverhand.json +train,data/mnot/redbot,mnotredbot.json +train,data/patterns-app/patterns-devkit,patterns-apppatterns-devkit.json +train,data/sinnwerkstatt/runrestic,sinnwerkstattrunrestic.json +train,data/aio-libs/yarl,aio-libsyarl.json +train,data/stencila/stencila,stencilastencila.json +train,data/Orange-OpenSource/hurl,Orange-OpenSourcehurl.json +train,data/GaloisInc/MATE,GaloisIncMATE.json +train,data/zulip/python-zulip-api,zulippython-zulip-api.json +train,data/microsoft/hi-ml,microsofthi-ml.json 
+train,data/aio-libs/async-timeout,aio-libsasync-timeout.json +train,data/cryzed/TrafficToll,cryzedTrafficToll.json +train,data/illBeRoy/taskipy,illBeRoytaskipy.json +train,data/python-poetry/poetry-core,python-poetrypoetry-core.json +train,data/timothycrosley/concentration,timothycrosleyconcentration.json +train,data/CIRCL/PyPDNS,CIRCLPyPDNS.json +train,data/pyproj4/pyproj,pyproj4pyproj.json +train,data/coala/coala-bears,coalacoala-bears.json +train,data/dragonchain/dragonchain,dragonchaindragonchain.json +train,data/facebookresearch/metaseq,facebookresearchmetaseq.json +train,data/klen/pylama,klenpylama.json +train,data/chris104957/maildown,chris104957maildown.json +train,data/ebellocchia/bip_utils,ebellocchiabip_utils.json +train,data/CiscoDevNet/cml-community,CiscoDevNetcml-community.json +train,data/colour-science/colour-checker-detection,colour-sciencecolour-checker-detection.json +train,data/inducer/relate,inducerrelate.json +train,data/stanford-oval/genie-cloud,stanford-ovalgenie-cloud.json +train,data/determined-ai/determined,determined-aidetermined.json +train,data/RDFLib/rdflib,RDFLibrdflib.json +train,data/matrix-org/sydent,matrix-orgsydent.json +train,data/bloomberg/memray,bloombergmemray.json +train,data/microsoft/planetary-computer-apis,microsoftplanetary-computer-apis.json +train,data/scikit-hep/resample,scikit-hepresample.json +train,data/numpy/numpy,numpynumpy.json +train,data/wemake-services/flake8-eradicate,wemake-servicesflake8-eradicate.json +train,data/nbro/ands,nbroands.json +train,data/kracekumar/python-typing-koans,kracekumarpython-typing-koans.json +train,data/everyclass/everyclass-server,everyclasseveryclass-server.json +train,data/cisagov/findcdn,cisagovfindcdn.json +train,data/pre-commit/mirrors-mypy,pre-commitmirrors-mypy.json +train,data/saleor/saleor,saleorsaleor.json +train,data/MarketSquare/webdrivermanager,MarketSquarewebdrivermanager.json +train,data/jbms/sphinx-immaterial,jbmssphinx-immaterial.json +train,data/iejMac/video2numpy,iejMacvideo2numpy.json +train,data/nasirhjafri/libyear,nasirhjafrilibyear.json +train,data/web-platform-tests/wpt.fyi,web-platform-testswpt.fyi.json +train,data/farridav/django-jazzmin,farridavdjango-jazzmin.json +train,data/wong2/pick,wong2pick.json +train,data/mims-harvard/TDC,mims-harvardTDC.json +train,data/akanz1/klib,akanz1klib.json +train,data/bethgelab/foolbox,bethgelabfoolbox.json +train,data/microsoft/msticpy,microsoftmsticpy.json +train,data/spectacles-ci/spectacles,spectacles-cispectacles.json +train,data/pytransitions/transitions,pytransitionstransitions.json +train,data/bluxmit/alnoda-workspaces,bluxmitalnoda-workspaces.json +train,data/thecode/ha-rpi_gpio,thecodeha-rpi_gpio.json +train,data/HybridRobotics/car-racing,HybridRoboticscar-racing.json +train,data/instrumenta/openapi2jsonschema,instrumentaopenapi2jsonschema.json +train,data/maximumstock/poe-currency-flip-planner,maximumstockpoe-currency-flip-planner.json +train,data/davidiommi/Pytorch--3D-Medical-Images-Segmentation--SALMON,davidiommiPytorch--3D-Medical-Images-Segmentation--SALMON.json +train,data/star-whale/starwhale,star-whalestarwhale.json +train,data/python-gitlab/python-gitlab,python-gitlabpython-gitlab.json +train,data/electrumsv/electrumsv,electrumsvelectrumsv.json +train,data/robocorp/rpaframework,robocorprpaframework.json +train,data/microsoft/CDM,microsoftCDM.json +train,data/srstevenson/nb-clean,srstevensonnb-clean.json +train,data/tugcanolgun/vigilio,tugcanolgunvigilio.json +train,data/fannheyward/coc-pyright,fannheywardcoc-pyright.json 
+train,data/alisaifee/flask-limiter,alisaifeeflask-limiter.json +train,data/WLM1ke/poptimizer,WLM1kepoptimizer.json +train,data/uds-se/debuggingbook,uds-sedebuggingbook.json +train,data/BradenM/micropy-cli,BradenMmicropy-cli.json +train,data/duo-labs/py_webauthn,duo-labspy_webauthn.json +train,data/stxnext/pymongo-migrate,stxnextpymongo-migrate.json +train,data/jannikmi/timezonefinder,jannikmitimezonefinder.json +train,data/log2timeline/dftimewolf,log2timelinedftimewolf.json +train,data/Toloka/crowd-kit,Tolokacrowd-kit.json +train,data/rotki/rotki,rotkirotki.json +train,data/robinhood-unofficial/pyrh,robinhood-unofficialpyrh.json +train,data/fhightower/ioc-finder,fhightowerioc-finder.json +train,data/cloud-custodian/cloud-custodian,cloud-custodiancloud-custodian.json +train,data/space-physics/lowtran,space-physicslowtran.json +train,data/Flask-Middleware/flask-security,Flask-Middlewareflask-security.json +train,data/revmischa/cloudcam,revmischacloudcam.json +train,data/facebookresearch/hiplot,facebookresearchhiplot.json +train,data/feeltheajf/trufflehog3,feeltheajftrufflehog3.json +train,data/webrecorder/browsertrix-old,webrecorderbrowsertrix-old.json +train,data/safe-global/safe-eth-py,safe-globalsafe-eth-py.json +train,data/allenai/allennlp-demo,allenaiallennlp-demo.json +train,data/TL-System/plato,TL-Systemplato.json +train,data/spmeesseman/vscode-taskexplorer,spmeessemanvscode-taskexplorer.json +train,data/jacobsvante/netsuite,jacobsvantenetsuite.json +train,data/InsightLab/PyMove,InsightLabPyMove.json +train,data/aertslab/SCope,aertslabSCope.json +train,data/toltec-dev/toltec,toltec-devtoltec.json +train,data/nolar/kopf,nolarkopf.json +train,data/wearerequired/lint-action,wearerequiredlint-action.json +train,data/iamdefinitelyahuman/py-solc-x,iamdefinitelyahumanpy-solc-x.json +train,data/schireson/pytest-mock-resources,schiresonpytest-mock-resources.json +train,data/Iapetus-11/Villager-Bot,Iapetus-11Villager-Bot.json +train,data/voxpupuli/puppetboard,voxpupulipuppetboard.json +train,data/aio-libs/aiohttp,aio-libsaiohttp.json +train,data/nat-n/poethepoet,nat-npoethepoet.json +train,data/Ambro17/slackify,Ambro17slackify.json +train,data/roscisz/TensorHive,rosciszTensorHive.json +train,data/doccano/doccano,doccanodoccano.json +train,data/persephone-tools/persephone,persephone-toolspersephone.json +train,data/tfranzel/drf-spectacular,tfranzeldrf-spectacular.json +train,data/UncleGoogle/galaxy-integration-humblebundle,UncleGooglegalaxy-integration-humblebundle.json +train,data/N-Wouda/ALNS,N-WoudaALNS.json +train,data/commaai/openpilot,commaaiopenpilot.json +train,data/NixOS/nixops,NixOSnixops.json +train,data/aiokitchen/aiomisc,aiokitchenaiomisc.json +train,data/razorx89/pydicom-seg,razorx89pydicom-seg.json +train,data/marcosschroh/cookiecutter-faust,marcosschrohcookiecutter-faust.json +train,data/flatpak/flatpak-builder-tools,flatpakflatpak-builder-tools.json +train,data/PetterS/monolith,PetterSmonolith.json +train,data/indygreg/python-zstandard,indygregpython-zstandard.json +train,data/Ouranosinc/xclim,Ouranosincxclim.json +train,data/ftexchange/ftx,ftexchangeftx.json +train,data/bridgecrewio/checkov,bridgecrewiocheckov.json +train,data/miyakogi/syncer,miyakogisyncer.json +train,data/tiangolo/meinheld-gunicorn-flask-docker,tiangolomeinheld-gunicorn-flask-docker.json +train,data/lovvskillz/python-discord-webhook,lovvskillzpython-discord-webhook.json +train,data/gpchelkin/scdlbot,gpchelkinscdlbot.json +train,data/stac-utils/stactools,stac-utilsstactools.json 
+train,data/aio-libs/aiohttp-remotes,aio-libsaiohttp-remotes.json +train,data/python-telegram-bot/python-telegram-bot,python-telegram-botpython-telegram-bot.json +train,data/joedevivo/vscode-circuitpython,joedevivovscode-circuitpython.json +train,data/BiomedSciAI/fuse-med-ml,BiomedSciAIfuse-med-ml.json +train,data/Kitware/SMQTK,KitwareSMQTK.json +train,data/stac-utils/pystac,stac-utilspystac.json +train,data/geopolars/geopolars,geopolarsgeopolars.json +train,data/ibis-project/ibis,ibis-projectibis.json +train,data/hardbyte/python-can,hardbytepython-can.json +train,data/cuducos/fio-de-ariadne,cuducosfio-de-ariadne.json +train,data/saturday06/VRM_Addon_for_Blender,saturday06VRM_Addon_for_Blender.json +train,data/python-poetry/poetry,python-poetrypoetry.json +train,data/theislab/ncem,theislabncem.json +train,data/explosion/thinc,explosionthinc.json +train,data/goodboy/tractor,goodboytractor.json +train,data/stan-dev/cmdstanpy,stan-devcmdstanpy.json +train,data/justindujardin/pathy,justindujardinpathy.json +train,data/microsoftgraph/msgraph-sdk-python-core,microsoftgraphmsgraph-sdk-python-core.json +train,data/aws/jsii,awsjsii.json +train,data/jborean93/pypsrp,jborean93pypsrp.json +train,data/Floorp-Projects/Floorp,Floorp-ProjectsFloorp.json +train,data/Trusted-AI/adversarial-robustness-toolbox,Trusted-AIadversarial-robustness-toolbox.json +train,data/robocorp/robotframework-lsp,robocorprobotframework-lsp.json +train,data/qiskit-community/qiskit-translations,qiskit-communityqiskit-translations.json +train,data/graphql-python/graphene-pydantic,graphql-pythongraphene-pydantic.json +train,data/TezRomacH/python-package-template,TezRomacHpython-package-template.json +train,data/overhangio/tutor,overhangiotutor.json +train,data/marcosschroh/dataclasses-avroschema,marcosschrohdataclasses-avroschema.json +train,data/fsouza/dotfiles,fsouzadotfiles.json +train,data/pallets/itsdangerous,palletsitsdangerous.json +train,data/dawnbeen/c_formatter_42,dawnbeenc_formatter_42.json +train,data/vmagamedov/grpclib,vmagamedovgrpclib.json +train,data/omnilib/aiosqlite,omnilibaiosqlite.json +train,data/RasaHQ/rasa,RasaHQrasa.json +train,data/cmu-delphi/delphi-epidata,cmu-delphidelphi-epidata.json +train,data/p1c2u/openapi-spec-validator,p1c2uopenapi-spec-validator.json +train,data/mees/calvin,meescalvin.json +train,data/temporalio/sdk-python,temporaliosdk-python.json +train,data/pythongssapi/python-gssapi,pythongssapipython-gssapi.json +train,data/rednafi/think-async,rednafithink-async.json +train,data/snok/django-guid,snokdjango-guid.json +train,data/CyberAgentAILab/cmaes,CyberAgentAILabcmaes.json +train,data/InterDigitalInc/CompressAI,InterDigitalIncCompressAI.json +train,data/PacktPublishing/Expert-Python-Programming-Fourth-Edition,PacktPublishingExpert-Python-Programming-Fourth-Edition.json +train,data/laughingman7743/PyAthena,laughingman7743PyAthena.json +train,data/itamarst/crochet,itamarstcrochet.json +train,data/sabuhish/fastapi-mqtt,sabuhishfastapi-mqtt.json +train,data/jasmcaus/caer,jasmcauscaer.json +train,data/dbt-labs/dbt-spark,dbt-labsdbt-spark.json +train,data/getsentry/sentry,getsentrysentry.json +train,data/seik/stilio,seikstilio.json +train,data/pytorch/hydra-torch,pytorchhydra-torch.json +train,data/TheAlgorithms/Python,TheAlgorithmsPython.json +train,data/RunestoneInteractive/RunestoneServer,RunestoneInteractiveRunestoneServer.json +train,data/gopro/OpenGoPro,goproOpenGoPro.json +train,data/twocucao/tifa,twocucaotifa.json +train,data/networktocode/yangify,networktocodeyangify.json 
+train,data/JarryShaw/PyPCAPKit,JarryShawPyPCAPKit.json +train,data/facebookresearch/SLIP,facebookresearchSLIP.json +train,data/dahlia/cjk-compsci-terms,dahliacjk-compsci-terms.json +train,data/dynaconf/dynaconf,dynaconfdynaconf.json +train,data/michaelharms/comcrawl,michaelharmscomcrawl.json +train,data/zeroepoch/plotbitrate,zeroepochplotbitrate.json +train,data/getsentry/sentry-python,getsentrysentry-python.json +train,data/cjolowicz/nox-poetry,cjolowicznox-poetry.json +train,data/vusec/collabfuzz,vuseccollabfuzz.json +train,data/fpgmaas/cookiecutter-poetry,fpgmaascookiecutter-poetry.json +train,data/brainix/pottery,brainixpottery.json +train,data/OWASP/raider,OWASPraider.json +train,data/facebook/ThreatExchange,facebookThreatExchange.json +train,data/sgratzl/slack_cleaner2,sgratzlslack_cleaner2.json +train,data/grafana/django-saml2-auth,grafanadjango-saml2-auth.json +train,data/openapi-generators/openapi-python-client,openapi-generatorsopenapi-python-client.json +train,data/microsoft/causica,microsoftcausica.json +train,data/signorrayan/RedTeam_toolkit,signorrayanRedTeam_toolkit.json +train,data/svix/python-ksuid,svixpython-ksuid.json +train,data/paul-nameless/tg,paul-namelesstg.json +train,data/tiangolo/sqlmodel,tiangolosqlmodel.json +train,data/mholtzscher/spacy_readability,mholtzscherspacy_readability.json +train,data/inyutin/aiohttp_retry,inyutinaiohttp_retry.json +train,data/yugabyte/yugabyte-db,yugabyteyugabyte-db.json +train,data/terraform-aws-modules/terraform-aws-notify-slack,terraform-aws-modulesterraform-aws-notify-slack.json +train,data/charliermarsh/ruff,charliermarshruff.json +train,data/duckdb/duckdb,duckdbduckdb.json +train,data/ansible/ansible-language-server,ansibleansible-language-server.json +train,data/vtess/FEMU,vtessFEMU.json +train,data/dfurtado/dataclass-csv,dfurtadodataclass-csv.json +train,data/pietrolesci/energizer,pietrolescienergizer.json +train,data/altair-viz/altair-transform,altair-vizaltair-transform.json +train,data/roniemartinez/dude,roniemartinezdude.json +train,data/Jigsaw-Code/net-analysis,Jigsaw-Codenet-analysis.json +train,data/Ulauncher/Ulauncher,UlauncherUlauncher.json +train,data/senwu/emmental,senwuemmental.json +train,data/oxigraph/oxigraph,oxigraphoxigraph.json +train,data/omnilib/aiomultiprocess,omnilibaiomultiprocess.json +train,data/vchaptsev/cookiecutter-django-vue,vchaptsevcookiecutter-django-vue.json +train,data/hjacobs/kube-ops-view,hjacobskube-ops-view.json +train,data/kogan/django-subscriptions,kogandjango-subscriptions.json +train,data/PyCQA/docformatter,PyCQAdocformatter.json +train,data/alandtse/tesla,alandtsetesla.json +train,data/alexander-akhmetov/python-telegram,alexander-akhmetovpython-telegram.json +train,data/SPFlow/SPFlow,SPFlowSPFlow.json +train,data/fastavro/fastavro,fastavrofastavro.json +train,data/dominodatalab/domino-research,dominodatalabdomino-research.json +train,data/nubank/fklearn,nubankfklearn.json +train,data/PacktPublishing/Mastering-Object-Oriented-Python-Second-Edition,PacktPublishingMastering-Object-Oriented-Python-Second-Edition.json +train,data/diodonfrost/terraform-aws-lambda-scheduler-stop-start,diodonfrostterraform-aws-lambda-scheduler-stop-start.json +train,data/mmuckley/torchkbnufft,mmuckleytorchkbnufft.json +train,data/FroggyTaipei/froggy-service,FroggyTaipeifroggy-service.json +train,data/aio-libs/aiologstash,aio-libsaiologstash.json +train,data/dj-stripe/dj-stripe,dj-stripedj-stripe.json +train,data/tiangolo/asyncer,tiangoloasyncer.json 
+train,data/dmontagu/fastapi-utils,dmontagufastapi-utils.json +train,data/odrling/peony-twitter,odrlingpeony-twitter.json +train,data/conversationai/conversationai-models,conversationaiconversationai-models.json +train,data/zbrookle/avionix,zbrookleavionix.json +train,data/ethereum/py-evm,ethereumpy-evm.json +train,data/explosion/confection,explosionconfection.json +train,data/qtile/qtile,qtileqtile.json +train,data/SeldonIO/alibi,SeldonIOalibi.json +train,data/Appen/UHV-OTS-Speech,AppenUHV-OTS-Speech.json +train,data/Limych/ha-temperature-feels-like,Limychha-temperature-feels-like.json +train,data/AlvarBer/Persimmon,AlvarBerPersimmon.json +train,data/HackSoftware/Django-Styleguide-Example,HackSoftwareDjango-Styleguide-Example.json +train,data/Limych/ha-average,Limychha-average.json +train,data/nedbat/scriv,nedbatscriv.json +train,data/appium/python-client,appiumpython-client.json +train,data/best-doctor/flake8-variables-names,best-doctorflake8-variables-names.json +train,data/Corvia/django-tenant-users,Corviadjango-tenant-users.json +train,data/lRomul/argus,lRomulargus.json +train,data/omanges/turfpy,omangesturfpy.json +train,data/Ericsson/codechecker,Ericssoncodechecker.json +train,data/CUNY-CL/wikipron,CUNY-CLwikipron.json +train,data/jonasrauber/eagerpy,jonasraubereagerpy.json +train,data/brainiak/brainiak,brainiakbrainiak.json +train,data/pypa/cibuildwheel,pypacibuildwheel.json +train,data/isaaccorley/torchrs,isaaccorleytorchrs.json +train,data/aio-libs/multidict,aio-libsmultidict.json +train,data/dcs-liberation/dcs_liberation,dcs-liberationdcs_liberation.json +train,data/computationalcore/cryptosteganography,computationalcorecryptosteganography.json +train,data/ninoseki/uzen,ninosekiuzen.json +train,data/brokenloop/jsontopydantic,brokenloopjsontopydantic.json +train,data/servo/servo,servoservo.json +train,data/christianhujer/expensereport,christianhujerexpensereport.json +train,data/gingsi/coot-videotext,gingsicoot-videotext.json +train,data/banksalad/python,banksaladpython.json +train,data/Lightning-AI/lightning-bolts,Lightning-AIlightning-bolts.json +train,data/mtkennerly/dunamai,mtkennerlydunamai.json +train,data/delta-io/delta-sharing,delta-iodelta-sharing.json +train,data/ICLRandD/Blackstone,ICLRandDBlackstone.json +train,data/vitorfs/colossus,vitorfscolossus.json +train,data/Yorko/bert-finetuning-catalyst,Yorkobert-finetuning-catalyst.json +train,data/Granulate/gprofiler,Granulategprofiler.json +train,data/jspahrsummers/adt,jspahrsummersadt.json +train,data/pypi/warehouse,pypiwarehouse.json +train,data/vzhd1701/evernote-backup,vzhd1701evernote-backup.json +train,data/strongbugman/ant_nest,strongbugmanant_nest.json +train,data/dbader/schedule,dbaderschedule.json +train,data/materialsproject/pymatgen,materialsprojectpymatgen.json +train,data/seandstewart/typical,seandstewarttypical.json +train,data/pearl-core/pearl,pearl-corepearl.json +train,data/nucleic/atom,nucleicatom.json +train,data/SeldonIO/alibi-detect,SeldonIOalibi-detect.json +train,data/Zuehlke/ConfZ,ZuehlkeConfZ.json +train,data/awslabs/mlmax,awslabsmlmax.json +train,data/allenai/tango,allenaitango.json +train,data/tomasfarias/airflow-dbt-python,tomasfariasairflow-dbt-python.json +train,data/FerrariDG/async-ml-inference,FerrariDGasync-ml-inference.json +train,data/rsinger86/drf-flex-fields,rsinger86drf-flex-fields.json +train,data/polarmutex/fava-envelope,polarmutexfava-envelope.json +train,data/ktbyers/netmiko,ktbyersnetmiko.json +train,data/GACWR/OpenUBA,GACWROpenUBA.json 
+train,data/facebookresearch/labgraph,facebookresearchlabgraph.json +train,data/ray-project/ray,ray-projectray.json +train,data/rhasspy/rhasspy,rhasspyrhasspy.json +train,data/kanidm/kanidm,kanidmkanidm.json +train,data/Lookyloo/lookyloo,Lookyloolookyloo.json +train,data/n8henrie/pycookiecheat,n8henriepycookiecheat.json +train,data/pwenker/chessli,pwenkerchessli.json +train,data/logancyang/loss-landscape-anim,logancyangloss-landscape-anim.json +train,data/abey79/vsketch,abey79vsketch.json +train,data/csernazs/pytest-httpserver,csernazspytest-httpserver.json +train,data/SpaceVim/SpaceVim,SpaceVimSpaceVim.json +train,data/PyCQA/pydocstyle,PyCQApydocstyle.json +train,data/ethereum/sharding,ethereumsharding.json +train,data/linaro-its/aws2-wrap,linaro-itsaws2-wrap.json +train,data/Project-MONAI/MONAI,Project-MONAIMONAI.json +train,data/nlpub/chinese-whispers-python,nlpubchinese-whispers-python.json +train,data/aio-libs/async-lru,aio-libsasync-lru.json +train,data/bocadilloproject/bocadillo,bocadilloprojectbocadillo.json +train,data/mirukana/mirage,mirukanamirage.json +train,data/Mayandev/django_morec,Mayandevdjango_morec.json +train,data/klensy/wt-tools,klensywt-tools.json +train,data/microsoft/task_oriented_dialogue_as_dataflow_synthesis,microsofttask_oriented_dialogue_as_dataflow_synthesis.json +train,data/trim21/transmission-rpc,trim21transmission-rpc.json +train,data/home-assistant-libs/zwave-js-server-python,home-assistant-libszwave-js-server-python.json +train,data/Azure/feast-azure,Azurefeast-azure.json +train,data/mcbeet/beet,mcbeetbeet.json +train,data/dair-iitd/imojie,dair-iitdimojie.json +train,data/obsidian-community/obsidian-hub,obsidian-communityobsidian-hub.json +train,data/secondmind-labs/GPflux,secondmind-labsGPflux.json +train,data/Textualize/rich-cli,Textualizerich-cli.json +train,data/jodal/pykka,jodalpykka.json +train,data/uclanlp/visualbert,uclanlpvisualbert.json +train,data/cgarwood/homeassistant-zwave_mqtt,cgarwoodhomeassistant-zwave_mqtt.json +train,data/simonepri/lm-scorer,simoneprilm-scorer.json +train,data/bincyber/pkictl,bincyberpkictl.json +train,data/nackjicholson/aiosql,nackjicholsonaiosql.json +train,data/aio-libs/aiopg,aio-libsaiopg.json +train,data/Enforcer/clean-architecture,Enforcerclean-architecture.json +train,data/rhasspy/larynx,rhasspylarynx.json +train,data/altair-viz/vega_datasets,altair-vizvega_datasets.json +train,data/gdsfactory/gdsfactory,gdsfactorygdsfactory.json +train,data/synesthesiam/opentts,synesthesiamopentts.json +train,data/streamlink/streamlink,streamlinkstreamlink.json +train,data/carpentries/amy,carpentriesamy.json +train,data/algorand/auction-demo,algorandauction-demo.json +train,data/apache/libcloud,apachelibcloud.json +train,data/domvwt/esparto,domvwtesparto.json +train,data/apoclyps/reviews,apoclypsreviews.json +train,data/aiven/pghoard,aivenpghoard.json +train,data/Yelp/detect-secrets,Yelpdetect-secrets.json +train,data/lnbits/lnbits,lnbitslnbits.json +train,data/dflook/terraform-github-actions,dflookterraform-github-actions.json +train,data/beeb/pancaketrade,beebpancaketrade.json +train,data/skorokithakis/catt,skorokithakiscatt.json +train,data/flopp/GpxTrackPoster,floppGpxTrackPoster.json +train,data/callahantiff/PheKnowLator,callahantiffPheKnowLator.json +train,data/pytorch/ort,pytorchort.json +train,data/playpauseandstop/rororo,playpauseandstoprororo.json +train,data/taverntesting/tavern,taverntestingtavern.json +train,data/pola-rs/polars,pola-rspolars.json +train,data/akdor1154/python-csql,akdor1154python-csql.json 
+train,data/dansanderson/picotool,dansandersonpicotool.json +train,data/nidhaloff/igel,nidhaloffigel.json +train,data/meilisearch/meilisearch-python,meilisearchmeilisearch-python.json +train,data/tortoise/aerich,tortoiseaerich.json +train,data/SamR1/FitTrackee,SamR1FitTrackee.json +train,data/medipixel/rl_algorithms,medipixelrl_algorithms.json +train,data/RhinoSecurityLabs/pacu,RhinoSecurityLabspacu.json +train,data/rytilahti/python-songpal,rytilahtipython-songpal.json +train,data/RyanJarv/cdn-proxy,RyanJarvcdn-proxy.json +train,data/databricks/dbt-databricks,databricksdbt-databricks.json +train,data/databand-ai/dbnd,databand-aidbnd.json +train,data/claws/aioprometheus,clawsaioprometheus.json +train,data/nccgroup/Solitude,nccgroupSolitude.json +train,data/akamhy/waybackpy,akamhywaybackpy.json +train,data/mlco2/codecarbon,mlco2codecarbon.json +train,data/rowanz/r2c,rowanzr2c.json +train,data/GoogleCloudPlatform/cloud-sql-python-connector,GoogleCloudPlatformcloud-sql-python-connector.json +train,data/GehirnInc/python-jwt,GehirnIncpython-jwt.json +train,data/mirumee/ariadne,mirumeeariadne.json +train,data/pandora-analysis/pandora,pandora-analysispandora.json +train,data/OpenRarity/open-rarity,OpenRarityopen-rarity.json +train,data/fuhrysteve/marshmallow-jsonschema,fuhrystevemarshmallow-jsonschema.json +train,data/fgmacedo/python-statemachine,fgmacedopython-statemachine.json +train,data/async-worker/aiologger,async-workeraiologger.json +train,data/mammuth/bitcoin-arbitrage-trading-bot,mammuthbitcoin-arbitrage-trading-bot.json +train,data/replicate/cog,replicatecog.json +train,data/breathe-doc/breathe,breathe-docbreathe.json +train,data/ipspace/netlab,ipspacenetlab.json +train,data/starburstdata/dbt-trino,starburstdatadbt-trino.json +train,data/LibreLingo/LibreLingo,LibreLingoLibreLingo.json +train,data/BEEmod/BEE2.4,BEEmodBEE2.4.json +train,data/yukinarit/pyserde,yukinaritpyserde.json +train,data/dolthub/doltpy,dolthubdoltpy.json +train,data/languitar/autosuspend,languitarautosuspend.json +train,data/dynobo/normcap,dynobonormcap.json +train,data/ambv/aiotone,ambvaiotone.json +train,data/funkyfuture/deck-chores,funkyfuturedeck-chores.json +train,data/clokep/celery-batches,clokepcelery-batches.json +train,data/osm-fr/osmose-backend,osm-frosmose-backend.json +train,data/QQuick/Transcrypt,QQuickTranscrypt.json +train,data/spaceml-org/ml4floods,spaceml-orgml4floods.json +train,data/westerndigitalcorporation/pyvcd,westerndigitalcorporationpyvcd.json +train,data/dmontagu/fastapi_client,dmontagufastapi_client.json +train,data/Fatal1ty/aioapns,Fatal1tyaioapns.json +train,data/numpy/numpy-stubs,numpynumpy-stubs.json +train,data/pytorch/torchdistx,pytorchtorchdistx.json +train,data/ml-tooling/lazydocs,ml-toolinglazydocs.json +train,data/facebookresearch/multimodal,facebookresearchmultimodal.json +train,data/facebookresearch/mtrl,facebookresearchmtrl.json +train,data/NREL/pysam,NRELpysam.json +train,data/tribe29/checkmk,tribe29checkmk.json +train,data/google/jax,googlejax.json +train,data/piccolo-orm/piccolo,piccolo-ormpiccolo.json +train,data/pulumi/pulumi,pulumipulumi.json +train,data/python/typeshed,pythontypeshed.json +train,data/TankerHQ/sdk-js,TankerHQsdk-js.json +train,data/realpython/materials,realpythonmaterials.json +train,data/bayesimpact/bob-emploi,bayesimpactbob-emploi.json +train,data/madpah/requirements-parser,madpahrequirements-parser.json +train,data/SergeyShk/ruTS,SergeyShkruTS.json +train,data/open-telemetry/opentelemetry-python-contrib,open-telemetryopentelemetry-python-contrib.json 
+train,data/Cornerstone-OnDemand/modelkit,Cornerstone-OnDemandmodelkit.json +train,data/Danielhiversen/flux_led,Danielhiversenflux_led.json +train,data/Qiskit/qiskit-aqua,Qiskitqiskit-aqua.json +train,data/arviz-devs/arviz,arviz-devsarviz.json +train,data/intel/neural-compressor,intelneural-compressor.json +train,data/aiven/journalpump,aivenjournalpump.json +train,data/colour-science/colour-hdri,colour-sciencecolour-hdri.json +train,data/facebookresearch/mbrl-lib,facebookresearchmbrl-lib.json +train,data/microsoft/onnxruntime,microsoftonnxruntime.json +train,data/avanov/Plim,avanovPlim.json +train,data/dprog-philippe-docourt/django-qr-code,dprog-philippe-docourtdjango-qr-code.json +train,data/pajbot/pajbot,pajbotpajbot.json +train,data/cleanlab/cleanlab,cleanlabcleanlab.json +train,data/SysCV/bdd100k-models,SysCVbdd100k-models.json +train,data/kinnala/scikit-fem,kinnalascikit-fem.json +train,data/thirdweb-dev/python-sdk,thirdweb-devpython-sdk.json +train,data/AshenOneYe/WFHelper,AshenOneYeWFHelper.json +train,data/Tinkoff/invest-python,Tinkoffinvest-python.json +train,data/mborgerson/xemu,mborgersonxemu.json +train,data/pasqal-io/Pulser,pasqal-ioPulser.json +train,data/rchain/rchain,rchainrchain.json +train,data/facebookresearch/Mephisto,facebookresearchMephisto.json +train,data/python-telegram-bot/rules-bot,python-telegram-botrules-bot.json +train,data/kjelljorner/morfeus,kjelljornermorfeus.json +train,data/tiangolo/meinheld-gunicorn-docker,tiangolomeinheld-gunicorn-docker.json +train,data/ur-whitelab/exmol,ur-whitelabexmol.json +train,data/timothycrosley/hypothesis-auto,timothycrosleyhypothesis-auto.json +train,data/jwkvam/bowtie,jwkvambowtie.json +train,data/federicotdn/wikiquote,federicotdnwikiquote.json +train,data/larsyencken/csvdiff,larsyenckencsvdiff.json +train,data/ManimCommunity/manim,ManimCommunitymanim.json +train,data/zwicker-group/py-pde,zwicker-grouppy-pde.json +train,data/danielperna84/hahomematic,danielperna84hahomematic.json +train,data/votingworks/arlo,votingworksarlo.json +train,data/vinissimus/async-asgi-testclient,vinissimusasync-asgi-testclient.json +train,data/nficano/yakutils,nficanoyakutils.json +train,data/allenai/scholarphi,allenaischolarphi.json +train,data/wemake-services/django-test-migrations,wemake-servicesdjango-test-migrations.json +train,data/mixmoe/HibiAPI,mixmoeHibiAPI.json +train,data/tmux-python/tmuxp,tmux-pythontmuxp.json +train,data/encode/typesystem,encodetypesystem.json +train,data/heuer/segno,heuersegno.json +train,data/alcarithemad/zfsp,alcarithemadzfsp.json +train,data/sybrenstuvel/python-rsa,sybrenstuvelpython-rsa.json +train,data/MarcTheSpark/scamp,MarcTheSparkscamp.json +train,data/kuwala-io/kuwala,kuwala-iokuwala.json +train,data/osohq/oso,osohqoso.json +train,data/demisto/content,demistocontent.json +train,data/ciscorn/starlette-graphene3,ciscornstarlette-graphene3.json +train,data/PostHog/posthog-foss,PostHogposthog-foss.json +train,data/nsidc/earthdata,nsidcearthdata.json +train,data/gchamon/sysrsync,gchamonsysrsync.json +train,data/samuelcolvin/aioaws,samuelcolvinaioaws.json +train,data/KLUE-benchmark/KLUE-baseline,KLUE-benchmarkKLUE-baseline.json +train,data/sabeechen/hassio-google-drive-backup,sabeechenhassio-google-drive-backup.json +train,data/mcneel/rhino.inside-revit,mcneelrhino.inside-revit.json +train,data/wemake-services/dump-env,wemake-servicesdump-env.json +train,data/jackjyq/xiaohei-zuowen,jackjyqxiaohei-zuowen.json +train,data/gaogaotiantian/objprint,gaogaotiantianobjprint.json 
+train,data/kdeldycke/meta-package-manager,kdeldyckemeta-package-manager.json +train,data/ealcobaca/pymfe,ealcobacapymfe.json +train,data/engnadeau/pybotics,engnadeaupybotics.json +train,data/CyberPunkMetalHead/gateio-crypto-trading-bot-binance-announcements-new-coins,CyberPunkMetalHeadgateio-crypto-trading-bot-binance-announcements-new-coins.json +train,data/eth-brownie/brownie,eth-browniebrownie.json +train,data/Systemcluster/The-Witcher-3-Mod-manager,SystemclusterThe-Witcher-3-Mod-manager.json +train,data/nsidnev/fastapi-realworld-example-app,nsidnevfastapi-realworld-example-app.json +train,data/elifesciences/sciencebeam-parser,elifesciencessciencebeam-parser.json +train,data/hashintel/hash,hashintelhash.json +train,data/apache/incubator-sedona,apacheincubator-sedona.json +train,data/freiheit/discord_feedbot,freiheitdiscord_feedbot.json +train,data/cgarwood/python-openzwave-mqtt,cgarwoodpython-openzwave-mqtt.json +train,data/nkolot/ProHMR,nkolotProHMR.json +train,data/karlch/vimiv-qt,karlchvimiv-qt.json +train,data/encode/starlette,encodestarlette.json +train,data/Riverside-Healthcare/djLint,Riverside-HealthcaredjLint.json +train,data/pmgbergen/porepy,pmgbergenporepy.json +train,data/mostafa/grest,mostafagrest.json +train,data/salesforce/ml4ir,salesforceml4ir.json +train,data/openvinotoolkit/training_extensions,openvinotoolkittraining_extensions.json +train,data/ducdetronquito/scalpl,ducdetronquitoscalpl.json +train,data/DonDebonair/slack-machine,DonDebonairslack-machine.json +train,data/stevearc/dql,stevearcdql.json +train,data/Pincer-org/Pincer,Pincer-orgPincer.json +train,data/luoliyan/incremental-reading,luoliyanincremental-reading.json +train,data/scalyr/scalyr-agent-2,scalyrscalyr-agent-2.json +train,data/maxisoft/Freenom-dns-updater,maxisoftFreenom-dns-updater.json +train,data/safe-global/safe-transaction-service,safe-globalsafe-transaction-service.json +train,data/alexander-akhmetov/python-shortcuts,alexander-akhmetovpython-shortcuts.json +train,data/p1c2u/openapi-core,p1c2uopenapi-core.json +train,data/n8henrie/fauxmo,n8henriefauxmo.json +train,data/untitled-ai/jupyter_ascending,untitled-aijupyter_ascending.json +train,data/mkdocstrings/mkdocstrings,mkdocstringsmkdocstrings.json +train,data/HXSecurity/DongTai,HXSecurityDongTai.json +train,data/bimmerconnected/bimmer_connected,bimmerconnectedbimmer_connected.json +train,data/Ultimaker/Uranium,UltimakerUranium.json +train,data/jamesoff/simplemonitor,jamesoffsimplemonitor.json +train,data/jacebrowning/memegen,jacebrowningmemegen.json +train,data/ehForwarderBot/efb-wechat-slave,ehForwarderBotefb-wechat-slave.json +train,data/ProdPerfect/monday,ProdPerfectmonday.json +train,data/terra-money/terra.py,terra-moneyterra.py.json +train,data/kobinpy/kobin,kobinpykobin.json +train,data/aio-libs/aiohttp-session,aio-libsaiohttp-session.json +train,data/mozilla/bigquery-etl,mozillabigquery-etl.json +train,data/martinl/openpilot,martinlopenpilot.json +train,data/feature-engine/feature_engine,feature-enginefeature_engine.json +train,data/redbearder/sapspa,redbeardersapspa.json +train,data/awslabs/amazon-transcribe-streaming-sdk,awslabsamazon-transcribe-streaming-sdk.json +train,data/xaviml/controllerx,xavimlcontrollerx.json +train,data/noctuid/zscroll,noctuidzscroll.json +train,data/dbt-labs/dbt-bigquery,dbt-labsdbt-bigquery.json +train,data/faust-streaming/faust,faust-streamingfaust.json +train,data/trailofbits/ManticoreUI,trailofbitsManticoreUI.json +train,data/Madoshakalaka/pipenv-setup,Madoshakalakapipenv-setup.json 
+train,data/brentspell/torch-yin,brentspelltorch-yin.json +train,data/twardoch/fonttools-opentype-feature-freezer,twardochfonttools-opentype-feature-freezer.json +train,data/fourjr/rainbot,fourjrrainbot.json +train,data/SigmaHQ/pySigma,SigmaHQpySigma.json +train,data/datafusion-contrib/datafusion-python,datafusion-contribdatafusion-python.json +train,data/welchbj/bscan,welchbjbscan.json +train,data/superdesk/superdesk,superdesksuperdesk.json +train,data/radix-ai/graphchain,radix-aigraphchain.json +train,data/Labelbox/labelbox-python,Labelboxlabelbox-python.json +train,data/himkt/konoha,himktkonoha.json +train,data/georgian-io/Transformers-Domain-Adaptation,georgian-ioTransformers-Domain-Adaptation.json +train,data/SolarEdgeTech/pyctuator,SolarEdgeTechpyctuator.json +train,data/allenai/allennlp-semparse,allenaiallennlp-semparse.json +train,data/stan-dev/pystan,stan-devpystan.json +train,data/SpaceXLaunchBot/SpaceXLaunchBot,SpaceXLaunchBotSpaceXLaunchBot.json +train,data/guilatrova/tryceratops,guilatrovatryceratops.json +train,data/escaped/django-inline-actions,escapeddjango-inline-actions.json +train,data/pemistahl/lingua-py,pemistahllingua-py.json +train,data/docker-science/cookiecutter-docker-science,docker-sciencecookiecutter-docker-science.json +train,data/Lightning-AI/lightning-transformers,Lightning-AIlightning-transformers.json +train,data/elastic/eland,elasticeland.json +train,data/aws/serverless-application-model,awsserverless-application-model.json +train,data/hjacobs/kube-resource-report,hjacobskube-resource-report.json +train,data/uber/bayesmark,uberbayesmark.json +train,data/callebtc/cashu,callebtccashu.json +train,data/autogoal/autogoal,autogoalautogoal.json +train,data/OriginQ/QPanda-2,OriginQQPanda-2.json +train,data/jazzband/django-axes,jazzbanddjango-axes.json +train,data/ansible/vscode-ansible,ansiblevscode-ansible.json +train,data/pallets/click,palletsclick.json +train,data/Josverl/micropython-stubber,Josverlmicropython-stubber.json +train,data/thunderstore-io/Thunderstore,thunderstore-ioThunderstore.json +train,data/e-valuation/EvaP,e-valuationEvaP.json +train,data/alerta/alerta,alertaalerta.json +train,data/galaxyproject/galaxy,galaxyprojectgalaxy.json +train,data/alanwilter/acpype,alanwilteracpype.json +train,data/ufal/neuralmonkey,ufalneuralmonkey.json +train,data/ludbek/webpreview,ludbekwebpreview.json +train,data/zayfod/pycozmo,zayfodpycozmo.json +train,data/claws/dump1090-exporter,clawsdump1090-exporter.json +train,data/jasperges/pose-thumbnails,jaspergespose-thumbnails.json +train,data/cloudtools/awacs,cloudtoolsawacs.json +train,data/pyvisa/pyvisa,pyvisapyvisa.json +train,data/RDFLib/sparqlwrapper,RDFLibsparqlwrapper.json +train,data/cve-search/PyCVESearch,cve-searchPyCVESearch.json +train,data/docat-org/docat,docat-orgdocat.json +train,data/blurg/sauron-engine,blurgsauron-engine.json +train,data/pappasam/jedi-language-server,pappasamjedi-language-server.json +train,data/henry-prior/jax-rl,henry-priorjax-rl.json +train,data/Fatal1ty/tinkoff-api,Fatal1tytinkoff-api.json +train,data/amor71/LiuAlgoTrader,amor71LiuAlgoTrader.json +train,data/abilian/olapy,abilianolapy.json +train,data/imagineai/create-django-app,imagineaicreate-django-app.json +train,data/bookingcom/upliftml,bookingcomupliftml.json +train,data/oracle/macest,oraclemacest.json +train,data/graphcore/examples,graphcoreexamples.json +train,data/FreeRTOS/coreMQTT,FreeRTOScoreMQTT.json +train,data/CogStack/MedCAT,CogStackMedCAT.json +train,data/m-burst/flake8-pytest-style,m-burstflake8-pytest-style.json 
+train,data/ifduyue/python-xxhash,ifduyuepython-xxhash.json +train,data/replit/replit-py,replitreplit-py.json +train,data/proteneer/timemachine,proteneertimemachine.json +train,data/ANL-CEEESA/MIPLearn,ANL-CEEESAMIPLearn.json +train,data/pythonitalia/pycon,pythonitaliapycon.json +train,data/jeremiecoullon/SGMCMCJax,jeremiecoullonSGMCMCJax.json +train,data/GoogleCloudPlatform/oozie-to-airflow,GoogleCloudPlatformoozie-to-airflow.json +train,data/pallets/flask,palletsflask.json +train,data/allenai/python-package-template,allenaipython-package-template.json +train,data/approvals/ApprovalTests.cpp,approvalsApprovalTests.cpp.json +train,data/kingname/Tinepeas,kingnameTinepeas.json +train,data/gbtami/pychess-variants,gbtamipychess-variants.json +train,data/CycloneDX/cyclonedx-python,CycloneDXcyclonedx-python.json +train,data/HTTP-APIs/hydrus,HTTP-APIshydrus.json +train,data/herrmannlab/dicomweb-client,herrmannlabdicomweb-client.json +train,data/golemfactory/clay,golemfactoryclay.json +train,data/miyakogi/wdom,miyakogiwdom.json +train,data/wemake-services/flake8-broken-line,wemake-servicesflake8-broken-line.json +train,data/cleverhans-lab/cleverhans,cleverhans-labcleverhans.json +train,data/YiriMiraiProject/YiriMirai,YiriMiraiProjectYiriMirai.json +train,data/chambliss/Multilingual_NER,chamblissMultilingual_NER.json +train,data/google/graphicsfuzz,googlegraphicsfuzz.json +train,data/up42/up42-py,up42up42-py.json +train,data/hikari-py/hikari,hikari-pyhikari.json +train,data/summa-tx/relays,summa-txrelays.json +train,data/KDE/syntax-highlighting,KDEsyntax-highlighting.json +train,data/pywemo/pywemo,pywemopywemo.json +train,data/bitsbb01/m3u8_creator,bitsbb01m3u8_creator.json +train,data/cqfn/aibolit,cqfnaibolit.json +train,data/cffls/pycardano,cfflspycardano.json +train,data/cider-security-research/cicd-goat,cider-security-researchcicd-goat.json +train,data/annotation-ai/python-project-template,annotation-aipython-project-template.json +train,data/studio-ousia/bpr,studio-ousiabpr.json +train,data/golemfactory/yagna,golemfactoryyagna.json +train,data/abhinavk99/jikanpy,abhinavk99jikanpy.json +train,data/Dentosal/python-sc2,Dentosalpython-sc2.json +train,data/miyakogi/pyppeteer,miyakogipyppeteer.json +train,data/SeldonIO/seldon-core,SeldonIOseldon-core.json +train,data/GaloisInc/saw-script,GaloisIncsaw-script.json +train,data/j-marple-dev/model_compression,j-marple-devmodel_compression.json +train,data/uwdata/draco,uwdatadraco.json +train,data/nucypher/pyUmbral,nucypherpyUmbral.json +train,data/full-stack-deep-learning/fsdl-text-recognizer,full-stack-deep-learningfsdl-text-recognizer.json +train,data/mpi4py/mpi4py,mpi4pympi4py.json +train,data/freelawproject/eyecite,freelawprojecteyecite.json +train,data/aws/chalice,awschalice.json +train,data/samuelcolvin/dirty-equals,samuelcolvindirty-equals.json +train,data/rsinger86/drf-typed-views,rsinger86drf-typed-views.json +train,data/MightyCreak/diffuse,MightyCreakdiffuse.json +train,data/iktakahiro/dddpy,iktakahirodddpy.json +train,data/sambarnes/fullstack-starknet,sambarnesfullstack-starknet.json +train,data/abersheeran/asgi-ratelimit,abersheeranasgi-ratelimit.json +train,data/scaleway/postal-address,scalewaypostal-address.json +train,data/allenporter/python-google-nest-sdm,allenporterpython-google-nest-sdm.json +train,data/ponty/PyVirtualDisplay,pontyPyVirtualDisplay.json +train,data/requests-cache/requests-cache,requests-cacherequests-cache.json +train,data/NannyML/nannyml,NannyMLnannyml.json 
+train,data/Ribbit-Network/ribbit-network-frog-sensor,Ribbit-Networkribbit-network-frog-sensor.json +train,data/woven-planet/l5kit,woven-planetl5kit.json +train,data/nbQA-dev/nbQA,nbQA-devnbQA.json +train,data/cryzed/Selenium-Requests,cryzedSelenium-Requests.json +train,data/digitalbitbox/bitbox02-firmware,digitalbitboxbitbox02-firmware.json +train,data/aeturrell/skimpy,aeturrellskimpy.json +train,data/plynx-team/plynx,plynx-teamplynx.json +train,data/zatosource/zato,zatosourcezato.json +train,data/materialsvirtuallab/m3gnet,materialsvirtuallabm3gnet.json +train,data/Qiskit/qiskit-machine-learning,Qiskitqiskit-machine-learning.json +train,data/v923z/micropython-ulab,v923zmicropython-ulab.json +train,data/mtkennerly/poetry-dynamic-versioning,mtkennerlypoetry-dynamic-versioning.json +train,data/whitphx/stlite,whitphxstlite.json +train,data/tiangolo/full-stack-flask-couchbase,tiangolofull-stack-flask-couchbase.json +train,data/abey79/vpype,abey79vpype.json +train,data/censys/censys-python,censyscensys-python.json +train,data/lbolla/EMpy,lbollaEMpy.json +train,data/digantamisra98/Echo,digantamisra98Echo.json +train,data/wechaty/python-wechaty,wechatypython-wechaty.json +train,data/coderedcorp/django-sass,coderedcorpdjango-sass.json +train,data/tiepvupsu/tabml,tiepvupsutabml.json +train,data/repology/repology-updater,repologyrepology-updater.json +train,data/activeloopai/deeplake,activeloopaideeplake.json +train,data/zigpy/zha-device-handlers,zigpyzha-device-handlers.json +train,data/Azure-Samples/azure-python-labs,Azure-Samplesazure-python-labs.json +train,data/python-mario/mario,python-mariomario.json +train,data/BMW-InnovationLab/BMW-YOLOv4-Training-Automation,BMW-InnovationLabBMW-YOLOv4-Training-Automation.json +train,data/polyaxon/hypertune,polyaxonhypertune.json +train,data/SEED-platform/seed,SEED-platformseed.json +train,data/ClemBotProject/ClemBot,ClemBotProjectClemBot.json +train,data/graphql-python/graphene-tornado,graphql-pythongraphene-tornado.json +train,data/bee-san/pyWhat,bee-sanpyWhat.json +train,data/jamesturk/spatula,jamesturkspatula.json +train,data/sabuhish/fastapi-mail,sabuhishfastapi-mail.json +train,data/urllib3/urllib3,urllib3urllib3.json +train,data/stanfordnqp/spins-b,stanfordnqpspins-b.json +train,data/scanapi/scanapi,scanapiscanapi.json +train,data/flacjacket/pywayland,flacjacketpywayland.json +train,data/esphome/aioesphomeapi,esphomeaioesphomeapi.json +train,data/Shopify/shopify_python,Shopifyshopify_python.json +train,data/aio-libs/aiokafka,aio-libsaiokafka.json +train,data/DT42/BerryNet,DT42BerryNet.json +train,data/ioos/compliance-checker,iooscompliance-checker.json +train,data/dwavesystems/dimod,dwavesystemsdimod.json +train,data/hjacobs/kube-janitor,hjacobskube-janitor.json +train,data/shacker/django-todo,shackerdjango-todo.json +train,data/spcl/serverless-benchmarks,spclserverless-benchmarks.json +train,data/shidenggui/easyquotation,shidengguieasyquotation.json +train,data/pyvista/pyvistaqt,pyvistapyvistaqt.json +train,data/python-trio/trustme,python-triotrustme.json +train,data/facebookresearch/xformers,facebookresearchxformers.json +train,data/facebook/usort,facebookusort.json +train,data/serum-community/pyserum,serum-communitypyserum.json +train,data/facebookresearch/craftassist,facebookresearchcraftassist.json +train,data/public/sonora,publicsonora.json +train,data/g0v/2020voting-guide,g0v2020voting-guide.json +train,data/benkehoe/aws-sso-util,benkehoeaws-sso-util.json +train,data/encode/databases,encodedatabases.json 
+train,data/ZeroIntensity/pointers.py,ZeroIntensitypointers.py.json +train,data/SAP/machine-learning-lab,SAPmachine-learning-lab.json +train,data/kmyk-jikka/Jikka,kmyk-jikkaJikka.json +train,data/tableau/server-client-python,tableauserver-client-python.json +train,data/kmille/dkim-verify,kmilledkim-verify.json +train,data/adferrand/dnsrobocert,adferranddnsrobocert.json +train,data/MISP/mail_to_misp,MISPmail_to_misp.json +train,data/dcos/dcos-e2e,dcosdcos-e2e.json +train,data/akamhy/videohash,akamhyvideohash.json +train,data/GridTools/gt4py,GridToolsgt4py.json +train,data/encode/httpcore,encodehttpcore.json +train,data/apache/doris,apachedoris.json +train,data/encode/uvicorn,encodeuvicorn.json +train,data/frictionlessdata/livemark,frictionlessdatalivemark.json +train,data/timothycrosley/streamdeck-ui,timothycrosleystreamdeck-ui.json +train,data/cedricbonhomme/Stegano,cedricbonhommeStegano.json +train,data/icedland/iced,icedlandiced.json +train,data/copier-org/copier,copier-orgcopier.json +train,data/networkx/networkx,networkxnetworkx.json +train,data/great-expectations/great_expectations,great-expectationsgreat_expectations.json +train,data/quantumlib/Cirq,quantumlibCirq.json +train,data/allenai/scispacy,allenaiscispacy.json +train,data/P403n1x87/austin-tui,P403n1x87austin-tui.json +train,data/openai/lm-human-preferences,openailm-human-preferences.json +train,data/terrencepreilly/darglint,terrencepreillydarglint.json +train,data/awslabs/aws-ddk,awslabsaws-ddk.json +train,data/Kruptein/PlanarAlly,KrupteinPlanarAlly.json +train,data/conda-incubator/conda-lock,conda-incubatorconda-lock.json +train,data/dropbox/sqlalchemy-stubs,dropboxsqlalchemy-stubs.json +train,data/cihai/cihai,cihaicihai.json +train,data/leverxgroup/esrgan,leverxgroupesrgan.json +train,data/returntocorp/bento,returntocorpbento.json +train,data/mkorpela/overrides,mkorpelaoverrides.json +train,data/gregdavill/KiBuzzard,gregdavillKiBuzzard.json +train,data/insanum/sncli,insanumsncli.json +train,data/amzn/amazon-ray,amznamazon-ray.json +train,data/someengineering/resoto,someengineeringresoto.json +train,data/mrlesmithjr/ansible-manage-lvm,mrlesmithjransible-manage-lvm.json +train,data/Irrational-Encoding-Wizardry/lvsfunc,Irrational-Encoding-Wizardrylvsfunc.json +train,data/tiangolo/typer-cli,tiangolotyper-cli.json +train,data/pytorch/ignite,pytorchignite.json +train,data/Synss/python-mbedtls,Synsspython-mbedtls.json +train,data/databricks/koalas,databrickskoalas.json +train,data/doccano/doccano-client,doccanodoccano-client.json +train,data/7zx/overload,7zxoverload.json +train,data/Hongbo-Miao/hongbomiao.com,Hongbo-Miaohongbomiao.com.json +train,data/JuliaRegistries/TagBot,JuliaRegistriesTagBot.json +train,data/s-knibbs/dataclasses-jsonschema,s-knibbsdataclasses-jsonschema.json +train,data/fluentpython/example-code-2e,fluentpythonexample-code-2e.json +train,data/sfu-db/dataprep,sfu-dbdataprep.json +train,data/mingrammer/pyreportcard,mingrammerpyreportcard.json +train,data/ivre/ivre,ivreivre.json +train,data/arXiv/arxiv-browse,arXivarxiv-browse.json +train,data/amolenaar/roles,amolenaarroles.json +train,data/tokenspice/tokenspice,tokenspicetokenspice.json +test,data/srstevenson/xdg,srstevensonxdg.json +test,data/sbrunner/deskew,sbrunnerdeskew.json +test,data/gijzelaerr/python-snap7,gijzelaerrpython-snap7.json +test,data/full-stack-deep-learning/fsdl-text-recognizer-project,full-stack-deep-learningfsdl-text-recognizer-project.json +test,data/jwkvam/celluloid,jwkvamcelluloid.json 
+test,data/python-desert/desert,python-desertdesert.json +test,data/intel/qemu-sgx,intelqemu-sgx.json +test,data/metaopt/torchopt,metaopttorchopt.json +test,data/google/megalista,googlemegalista.json +test,data/ddelange/pipgrip,ddelangepipgrip.json +test,data/flaskcwg/flask-docs-zh,flaskcwgflask-docs-zh.json +test,data/Davestring/ESCOM,DavestringESCOM.json +test,data/asyrjasalo/RESTinstance,asyrjasaloRESTinstance.json +test,data/Rostlab/nalaf,Rostlabnalaf.json +test,data/aws-samples/aws-cdk-project-structure-python,aws-samplesaws-cdk-project-structure-python.json +test,data/spotify/pedalboard,spotifypedalboard.json +test,data/aio-libs/janus,aio-libsjanus.json +test,data/probberechts/soccerdata,probberechtssoccerdata.json +test,data/microsoft/electionguard-python,microsoftelectionguard-python.json +test,data/redcap-tools/PyCap,redcap-toolsPyCap.json +test,data/datopian/giftless,datopiangiftless.json +test,data/tko22/flask-boilerplate,tko22flask-boilerplate.json +test,data/pypa/bandersnatch,pypabandersnatch.json +test,data/luizalabs/lasier,luizalabslasier.json +test,data/objectiv/objectiv-analytics,objectivobjectiv-analytics.json +test,data/streamlit/streamlit,streamlitstreamlit.json +test,data/mevellea/telegram_menu,mevelleatelegram_menu.json +test,data/kuutsav/LeetComp,kuutsavLeetComp.json +test,data/qemu/qemu,qemuqemu.json +test,data/hjacobs/kube-downscaler,hjacobskube-downscaler.json +test,data/openai/DALL-E,openaiDALL-E.json +test,data/gouline/dbt-metabase,goulinedbt-metabase.json +test,data/minitorch/Module-0,minitorchModule-0.json +test,data/encode/broadcaster,encodebroadcaster.json +test,data/AndreMiras/EtherollApp,AndreMirasEtherollApp.json +test,data/blakeblackshear/frigate-hass-integration,blakeblackshearfrigate-hass-integration.json +test,data/sb-ai-lab/LightAutoML,sb-ai-labLightAutoML.json +test,data/brandtbucher/specialist,brandtbucherspecialist.json +test,data/Qiskit/qiskit-finance,Qiskitqiskit-finance.json +test,data/microsoft/LocalizedStringKit,microsoftLocalizedStringKit.json +test,data/zyxue/ncbitax2lin,zyxuencbitax2lin.json +test,data/yukiarrr/Il2cppSpy,yukiarrrIl2cppSpy.json +test,data/henryruhs/chroma-feedback,henryruhschroma-feedback.json +test,data/quora/qcore,quoraqcore.json +test,data/poljar/matrix-nio,poljarmatrix-nio.json +test,data/quay/quay,quayquay.json +test,data/mindsdb/mindsdb,mindsdbmindsdb.json +test,data/kennethreitz-archive/requests3,kennethreitz-archiverequests3.json +test,data/mxmlnkn/ratarmount,mxmlnknratarmount.json +test,data/prideout/svg3d,prideoutsvg3d.json +test,data/grillazz/fastapi-sqlalchemy-asyncpg,grillazzfastapi-sqlalchemy-asyncpg.json +test,data/megvii-research/hpman,megvii-researchhpman.json +test,data/epicserve/django-base-site,epicservedjango-base-site.json +test,data/cookiejar/cookietemple,cookiejarcookietemple.json +test,data/dpgaspar/Flask-AppBuilder,dpgasparFlask-AppBuilder.json +test,data/theupdateframework/python-tuf,theupdateframeworkpython-tuf.json +test,data/GoogleCloudPlatform/bigquery-utils,GoogleCloudPlatformbigquery-utils.json +test,data/long2ice/rearq,long2icerearq.json +test,data/doorstop-dev/doorstop,doorstop-devdoorstop.json +test,data/lnresearch/topology,lnresearchtopology.json +test,data/tough-dev-school/education-backend,tough-dev-schooleducation-backend.json +test,data/TikhonJelvis/RL-book,TikhonJelvisRL-book.json +test,data/svix/svix-webhooks,svixsvix-webhooks.json +test,data/bureaucratic-labs/dostoevsky,bureaucratic-labsdostoevsky.json +test,data/RasaHQ/rasa-sdk,RasaHQrasa-sdk.json 
+test,data/devpi/devpi,devpidevpi.json +test,data/Pebaz/nimporter,Pebaznimporter.json +test,data/danihodovic/django-allauth-ui,danihodovicdjango-allauth-ui.json +test,data/meltano/meltano,meltanomeltano.json +test,data/openedx/edx-platform,openedxedx-platform.json +test,data/nvbn/py-backwards,nvbnpy-backwards.json +test,data/facelessuser/soupsieve,facelessusersoupsieve.json +test,data/pallets-eco/flask-caching,pallets-ecoflask-caching.json +test,data/cltk/cltk,cltkcltk.json +test,data/mongodb/mongodb-kubernetes-operator,mongodbmongodb-kubernetes-operator.json +test,data/cvpaperchallenge/Ascender,cvpaperchallengeAscender.json +test,data/OSInside/kiwi,OSInsidekiwi.json +test,data/MariiaSizova/breakfastapi,MariiaSizovabreakfastapi.json +test,data/ML-KULeuven/socceraction,ML-KULeuvensocceraction.json +test,data/datapane/datapane,datapanedatapane.json +test,data/nelson-liu/contextual-repr-analysis,nelson-liucontextual-repr-analysis.json +test,data/HelloGitHub-Team/Hydra,HelloGitHub-TeamHydra.json +test,data/alshedivat/meta-blocks,alshedivatmeta-blocks.json +test,data/Erotemic/ubelt,Erotemicubelt.json +test,data/HyphaApp/hypha,HyphaApphypha.json +test,data/springload/draftjs_exporter,springloaddraftjs_exporter.json +test,data/skyplane-project/skyplane,skyplane-projectskyplane.json +test,data/mCodingLLC/SlapThatLikeButton-TestingStarterProject,mCodingLLCSlapThatLikeButton-TestingStarterProject.json +test,data/jdb78/pytorch-forecasting,jdb78pytorch-forecasting.json +test,data/cs-cordero/py-ts-interfaces,cs-corderopy-ts-interfaces.json +test,data/Animenosekai/translate,Animenosekaitranslate.json +test,data/nautechsystems/nautilus_trader,nautechsystemsnautilus_trader.json +test,data/alecthomas/injector,alecthomasinjector.json +test,data/jacebrowning/datafiles,jacebrowningdatafiles.json +test,data/facebook/PathPicker,facebookPathPicker.json +test,data/facebookresearch/nevergrad,facebookresearchnevergrad.json +test,data/DeanWay/fastapi-versioning,DeanWayfastapi-versioning.json +test,data/G-Research/armada,G-Researcharmada.json +test,data/unknown-horizons/unknown-horizons,unknown-horizonsunknown-horizons.json +test,data/jasonrollins/shareplum,jasonrollinsshareplum.json +test,data/CCS-Lab/hBayesDM,CCS-LabhBayesDM.json +test,data/dckiller51/bodymiscale,dckiller51bodymiscale.json +test,data/PyCQA/pylint,PyCQApylint.json +test,data/emscripten-core/emscripten,emscripten-coreemscripten.json +test,data/AudiusProject/audius-protocol,AudiusProjectaudius-protocol.json +test,data/deel-ai/xplique,deel-aixplique.json +test,data/divkit/divkit,divkitdivkit.json +test,data/explosion/catalogue,explosioncatalogue.json +test,data/pysamp/PySAMP,pysampPySAMP.json +test,data/aristocratos/bpytop,aristocratosbpytop.json +test,data/NiklasRosenstein/pydoc-markdown,NiklasRosensteinpydoc-markdown.json +test,data/SeldonIO/MLServer,SeldonIOMLServer.json +test,data/alan-turing-institute/rse-course,alan-turing-instituterse-course.json +test,data/lonsty/xvideos-dl,lonstyxvideos-dl.json +test,data/codemagic-ci-cd/cli-tools,codemagic-ci-cdcli-tools.json +test,data/samuelcolvin/watchfiles,samuelcolvinwatchfiles.json +test,data/celery/kombu,celerykombu.json +test,data/tatp22/multidim-positional-encoding,tatp22multidim-positional-encoding.json +test,data/PacktPublishing/Speed-up-your-Python-with-Rust,PacktPublishingSpeed-up-your-Python-with-Rust.json +test,data/pviafore/RobustPython,pviaforeRobustPython.json +test,data/samuelcolvin/rtoml,samuelcolvinrtoml.json +test,data/uriyyo/fastapi-pagination,uriyyofastapi-pagination.json 
+test,data/kevinheavey/anchorpy,kevinheaveyanchorpy.json +test,data/zli117/EInk-Calendar,zli117EInk-Calendar.json +test,data/MartinThoma/hwrt,MartinThomahwrt.json +test,data/Tishka17/dataclass_factory,Tishka17dataclass_factory.json +test,data/jyoung8607/openpilot,jyoung8607openpilot.json +test,data/acoustid/acoustid-server,acoustidacoustid-server.json +test,data/pyinat/pyinaturalist,pyinatpyinaturalist.json +test,data/twisted/klein,twistedklein.json +test,data/tskit-dev/msprime,tskit-devmsprime.json +test,data/Gsllchb/Handright,GsllchbHandright.json +test,data/ahawker/ulid,ahawkerulid.json +test,data/mkorpela/pabot,mkorpelapabot.json +test,data/pallets/werkzeug,palletswerkzeug.json +test,data/dbt-labs/dbt-snowflake,dbt-labsdbt-snowflake.json +test,data/bluesky/bluesky,blueskybluesky.json +test,data/unitaryfund/mitiq,unitaryfundmitiq.json +test,data/tiangolo/uvicorn-gunicorn-machine-learning-docker,tiangolouvicorn-gunicorn-machine-learning-docker.json +test,data/rahiel/telegram-send,rahieltelegram-send.json +test,data/yampelo/beagle,yampelobeagle.json +test,data/home-assistant-libs/pytradfri,home-assistant-libspytradfri.json +test,data/sobjornstad/AnkiLPCG,sobjornstadAnkiLPCG.json +test,data/dotnet/performance,dotnetperformance.json +test,data/colour-science/colour,colour-sciencecolour.json +test,data/sissaschool/xmlschema,sissaschoolxmlschema.json +test,data/mapillary/mapillary_tools,mapillarymapillary_tools.json +test,data/roniemartinez/latex2mathml,roniemartinezlatex2mathml.json +test,data/commitizen-tools/commitizen,commitizen-toolscommitizen.json +test,data/RSSerpent/RSSerpent,RSSerpentRSSerpent.json +test,data/tornadoweb/tornado,tornadowebtornado.json +test,data/timothycrosley/pdocs,timothycrosleypdocs.json +test,data/Yura52/rtdl,Yura52rtdl.json +test,data/TaleLin/lin-cms-flask,TaleLinlin-cms-flask.json +test,data/tiangolo/poetry-version-plugin,tiangolopoetry-version-plugin.json +test,data/liaopeiyuan/TransferDet,liaopeiyuanTransferDet.json +test,data/determined-ai/yogadl,determined-aiyogadl.json +test,data/mikeshardmind/SinbadCogs,mikeshardmindSinbadCogs.json +test,data/jrialland/python-astar,jriallandpython-astar.json +test,data/ServiceNow/picard,ServiceNowpicard.json +test,data/spokestack/spokestack-python,spokestackspokestack-python.json +test,data/turner-townsend/flask-pydantic-spec,turner-townsendflask-pydantic-spec.json +test,data/kyclark/biofx_python,kyclarkbiofx_python.json +test,data/faustomorales/vit-keras,faustomoralesvit-keras.json +test,data/cuducos/calculadora-do-cidadao,cuducoscalculadora-do-cidadao.json +test,data/aio-libs/aiojobs,aio-libsaiojobs.json +test,data/dylan-profiler/visions,dylan-profilervisions.json +test,data/zero323/pyspark-stubs,zero323pyspark-stubs.json +test,data/amundsen-io/amundsenfrontendlibrary,amundsen-ioamundsenfrontendlibrary.json +test,data/mjpieters/aiolimiter,mjpietersaiolimiter.json +test,data/pytorch/serve,pytorchserve.json +test,data/ask/mode,askmode.json +test,data/DeebotUniverse/Deebot-4-Home-Assistant,DeebotUniverseDeebot-4-Home-Assistant.json +test,data/tiangolo/full-stack-fastapi-postgresql,tiangolofull-stack-fastapi-postgresql.json +test,data/gigamonkey/monorepoize,gigamonkeymonorepoize.json +test,data/microsoft/torchgeo,microsofttorchgeo.json +test,data/justindujardin/mathy,justindujardinmathy.json +test,data/MartinThoma/flake8-simplify,MartinThomaflake8-simplify.json +test,data/johnthagen/python-blueprint,johnthagenpython-blueprint.json +test,data/sco1/flake8-annotations,sco1flake8-annotations.json 
+test,data/cuducos/bot-followers,cuducosbot-followers.json +test,data/XKNX/xknx,XKNXxknx.json +test,data/bashtage/arch,bashtagearch.json +test,data/fish2000/instakit,fish2000instakit.json +test,data/airbnb/binaryalert,airbnbbinaryalert.json +test,data/rowanz/swagaf,rowanzswagaf.json +test,data/mbrg/power-pwn,mbrgpower-pwn.json +test,data/supakeen/pinnwand,supakeenpinnwand.json +test,data/mathiasertl/django-ca,mathiasertldjango-ca.json +test,data/nextcord/nextcord-v3,nextcordnextcord-v3.json +test,data/elbakramer/koapy,elbakramerkoapy.json +test,data/Minyus/causallift,Minyuscausallift.json +test,data/common-workflow-language/common-workflow-language,common-workflow-languagecommon-workflow-language.json +test,data/certbot/certbot,certbotcertbot.json +test,data/leikoilja/ha-google-home,leikoiljaha-google-home.json +test,data/HypothesisWorks/hypothesis,HypothesisWorkshypothesis.json +test,data/woltapp/wolt-python-package-cookiecutter,woltappwolt-python-package-cookiecutter.json +test,data/Unleash/unleash-client-python,Unleashunleash-client-python.json +test,data/Fatal1ty/mashumaro,Fatal1tymashumaro.json +test,data/explosion/spaCy,explosionspaCy.json +test,data/panther-labs/panther-analysis,panther-labspanther-analysis.json +test,data/abersheeran/a2wsgi,abersheerana2wsgi.json +test,data/laramies/theHarvester,laramiestheHarvester.json +test,data/AnalogJ/lexicon,AnalogJlexicon.json +test,data/frenck/python-adguardhome,frenckpython-adguardhome.json +test,data/halhorn/deep_dialog_tutorial,halhorndeep_dialog_tutorial.json +test,data/flatironinstitute/CaImAn,flatironinstituteCaImAn.json +test,data/summa-tx/bitcoin-spv,summa-txbitcoin-spv.json +test,data/wustho/epy,wusthoepy.json +test,data/aws/aws-encryption-sdk-cli,awsaws-encryption-sdk-cli.json +test,data/adamghill/django-unicorn,adamghilldjango-unicorn.json +test,data/LineaLabs/lineapy,LineaLabslineapy.json +test,data/abhisheks008/ML-Crate,abhisheks008ML-Crate.json +test,data/yoeo/guesslang,yoeoguesslang.json +test,data/sopel-irc/sopel,sopel-ircsopel.json +test,data/jokull/python-ts-graphql-demo,jokullpython-ts-graphql-demo.json +test,data/jordaneremieff/djantic,jordaneremieffdjantic.json +test,data/y123456yz/reading-and-annotate-mongodb-3.6,y123456yzreading-and-annotate-mongodb-3.6.json +test,data/PostHog/drf-exceptions-hog,PostHogdrf-exceptions-hog.json +test,data/neondatabase/neon,neondatabaseneon.json +test,data/stac-utils/pgstac,stac-utilspgstac.json +test,data/repleo/bounca,repleobounca.json +test,data/paritytrading/philadelphia,paritytradingphiladelphia.json +test,data/facebookresearch/GraphLog,facebookresearchGraphLog.json +test,data/flyteorg/flytekit,flyteorgflytekit.json +test,data/rochacbruno/flask-project-template,rochacbrunoflask-project-template.json +test,data/salesforce/cloudsplaining,salesforcecloudsplaining.json +test,data/remcohaszing/pywakeonlan,remcohaszingpywakeonlan.json +test,data/encode/orm,encodeorm.json +test,data/outscale/osc-cli,outscaleosc-cli.json +test,data/NVlabs/two-shot-brdf-shape,NVlabstwo-shot-brdf-shape.json +test,data/mozilla/fx-private-relay,mozillafx-private-relay.json +test,data/rspeer/python-ftfy,rspeerpython-ftfy.json +test,data/tsatsujnr139/fastapi-role-based-access-control-auth-service,tsatsujnr139fastapi-role-based-access-control-auth-service.json +test,data/hhursev/recipe-scrapers,hhursevrecipe-scrapers.json +test,data/TylerYep/torchinfo,TylerYeptorchinfo.json +test,data/bergercookie/taskwarrior-syncall,bergercookietaskwarrior-syncall.json +test,data/rhasspy/gruut,rhasspygruut.json 
+test,data/dongweiming/lyanna,dongweiminglyanna.json +test,data/roniemartinez/real-time-charts-with-flask,roniemartinezreal-time-charts-with-flask.json +test,data/zcemycl/TF2DeepFloorplan,zcemyclTF2DeepFloorplan.json +test,data/deknowny/vkquick,deknownyvkquick.json +test,data/spulec/moto,spulecmoto.json +test,data/altair-viz/altair_data_server,altair-vizaltair_data_server.json +test,data/hay-kot/mealie,hay-kotmealie.json +test,data/awslabs/gluonts,awslabsgluonts.json +test,data/neogeny/TatSu,neogenyTatSu.json +test,data/Kludex/fastapi-health,Kludexfastapi-health.json +test,data/0x0FB0/pulsar,0x0FB0pulsar.json +test,data/ergrelet/unlicense,ergreletunlicense.json +test,data/microsoft/python-type-stubs,microsoftpython-type-stubs.json +test,data/Kotaimen/awscfncli,Kotaimenawscfncli.json +test,data/nyu-mll/jiant,nyu-mlljiant.json +test,data/microsoft/playwright-pytest,microsoftplaywright-pytest.json +test,data/ietf-tools/datatracker,ietf-toolsdatatracker.json +test,data/ilcardella/TradingBot,ilcardellaTradingBot.json +test,data/Neoteroi/BlackSheep,NeoteroiBlackSheep.json +test,data/Curt-Park/rainbow-is-all-you-need,Curt-Parkrainbow-is-all-you-need.json +test,data/zurutech/ashpy,zurutechashpy.json +test,data/PyBites-Open-Source/pybites-carbon,PyBites-Open-Sourcepybites-carbon.json +test,data/bram2w/baserow,bram2wbaserow.json +test,data/cisco/mindmeld,ciscomindmeld.json +test,data/tencent-quantum-lab/tensorcircuit,tencent-quantum-labtensorcircuit.json +test,data/immuni-app/immuni-backend-common,immuni-appimmuni-backend-common.json +test,data/hjacobs/kube-web-view,hjacobskube-web-view.json +test,data/joouha/euporie,joouhaeuporie.json +test,data/xchwarze/samsung-tv-ws-api,xchwarzesamsung-tv-ws-api.json +test,data/aio-libs/aiozipkin,aio-libsaiozipkin.json +test,data/Flexget/Flexget,FlexgetFlexget.json +test,data/PyCQA/flake8-pyi,PyCQAflake8-pyi.json +test,data/materialsvirtuallab/megnet,materialsvirtuallabmegnet.json +test,data/sentinel-hub/sentinelhub-py,sentinel-hubsentinelhub-py.json +test,data/ancasag/ensembleObjectDetection,ancasagensembleObjectDetection.json +test,data/nornir-automation/nornir,nornir-automationnornir.json +test,data/allenai/allennlp,allenaiallennlp.json +test,data/jabbalaci/JSON-path,jabbalaciJSON-path.json +test,data/gojek/merlin,gojekmerlin.json +test,data/keybase/pykeybasebot,keybasepykeybasebot.json +test,data/ethyca/fides,ethycafides.json +test,data/WebLogo/weblogo,WebLogoweblogo.json +test,data/LDO-CERT/orochi,LDO-CERTorochi.json +test,data/openstack/oslo.config,openstackoslo.config.json +test,data/lucyparsons/OpenOversight,lucyparsonsOpenOversight.json +test,data/AsheKR/django-query-capture,AsheKRdjango-query-capture.json +test,data/SeldonIO/tempo,SeldonIOtempo.json +test,data/Textualize/rich,Textualizerich.json +test,data/mpkocher/pydantic-cli,mpkocherpydantic-cli.json +test,data/ternaus/retinaface,ternausretinaface.json +test,data/wkentaro/imgviz,wkentaroimgviz.json +test,data/alephdata/fingerprints,alephdatafingerprints.json +test,data/FlexMeasures/flexmeasures,FlexMeasuresflexmeasures.json +test,data/facebookresearch/hydra,facebookresearchhydra.json +test,data/python-microservices/microservices-scaffold,python-microservicesmicroservices-scaffold.json +test,data/GoogleCloudPlatform/gcpdiag,GoogleCloudPlatformgcpdiag.json +test,data/MerosCrypto/Meros,MerosCryptoMeros.json +test,data/jupyter-server/jupyverse,jupyter-serverjupyverse.json +test,data/youtype/mypy_boto3_builder,youtypemypy_boto3_builder.json +test,data/borchero/pycave,borcheropycave.json 
+test,data/polyaxon/polyaxon,polyaxonpolyaxon.json +test,data/ponty/EasyProcess,pontyEasyProcess.json +test,data/cosmicpython/book,cosmicpythonbook.json +test,data/your-tools/tbump,your-toolstbump.json +test,data/sudoguy/tiktok_bot,sudoguytiktok_bot.json +test,data/HazyResearch/fonduer,HazyResearchfonduer.json +test,data/motional/nuplan-devkit,motionalnuplan-devkit.json +test,data/apiflask/apiflask,apiflaskapiflask.json +test,data/juftin/camply,juftincamply.json +test,data/duo-labs/webauthn.io,duo-labswebauthn.io.json +test,data/procrastinate-org/procrastinate,procrastinate-orgprocrastinate.json +test,data/tiangolo/uwsgi-nginx-docker,tiangolouwsgi-nginx-docker.json +test,data/supertokens/supertokens-python,supertokenssupertokens-python.json +test,data/BibliothecaForAdventurers/realms-contracts,BibliothecaForAdventurersrealms-contracts.json +test,data/AtsushiSakai/PythonRobotics,AtsushiSakaiPythonRobotics.json +test,data/ioos/erddapy,iooserddapy.json +test,data/psf/black,psfblack.json +test,data/felinae98/nonebot-bison,felinae98nonebot-bison.json +test,data/guardicore/monkey,guardicoremonkey.json +test,data/alerta/zabbix-alerta,alertazabbix-alerta.json +test,data/pytorch/torchdynamo,pytorchtorchdynamo.json +test,data/hotzenklotz/picobrew-server,hotzenklotzpicobrew-server.json +test,data/aws/aws-sdk-pandas,awsaws-sdk-pandas.json +test,data/awslabs/aws-lambda-powertools-python,awslabsaws-lambda-powertools-python.json +test,data/yeraydiazdiaz/lunr.py,yeraydiazdiazlunr.py.json +test,data/blakeblackshear/frigate,blakeblackshearfrigate.json +test,data/VOICEVOX/voicevox_engine,VOICEVOXvoicevox_engine.json +test,data/carlmontanari/scrapli,carlmontanariscrapli.json +test,data/evidentlyai/evidently,evidentlyaievidently.json +test,data/facebookincubator/Bowler,facebookincubatorBowler.json +test,data/VirtusLab/git-machete,VirtusLabgit-machete.json +test,data/roedoejet/g2p,roedoejetg2p.json +test,data/smallwat3r/shhh,smallwat3rshhh.json +test,data/Embroidermodder/Embroidermodder,EmbroidermodderEmbroidermodder.json +test,data/pyscript/pyscript-cli,pyscriptpyscript-cli.json +test,data/requests-cache/aiohttp-client-cache,requests-cacheaiohttp-client-cache.json +test,data/j-marple-dev/AYolov2,j-marple-devAYolov2.json +test,data/jbms/finance-dl,jbmsfinance-dl.json +test,data/GPflow/GPflow,GPflowGPflow.json +test,data/neo4j/neo4j-python-driver,neo4jneo4j-python-driver.json +test,data/feast-dev/feast,feast-devfeast.json +test,data/wemake-services/dotenv-linter,wemake-servicesdotenv-linter.json +test,data/kurtmckee/feedparser,kurtmckeefeedparser.json +test,data/pallets/jinja,palletsjinja.json +test,data/networktocode/netutils,networktocodenetutils.json +test,data/freedomofpress/dangerzone,freedomofpressdangerzone.json +test,data/tsileo/little-boxes,tsileolittle-boxes.json +test,data/byceps/byceps,bycepsbyceps.json +test,data/polygon-io/client-python,polygon-ioclient-python.json +test,data/fedejaure/cookiecutter-modern-pypackage,fedejaurecookiecutter-modern-pypackage.json +test,data/aminohealth/wonk,aminohealthwonk.json +test,data/zama-ai/concrete-numpy,zama-aiconcrete-numpy.json +test,data/rism-digital/verovio,rism-digitalverovio.json +test,data/rochacbruno/fastapi-project-template,rochacbrunofastapi-project-template.json +test,data/tortoise/tortoise-orm,tortoisetortoise-orm.json +test,data/chdsbd/kodiak,chdsbdkodiak.json +test,data/flask-extensions/Flask-GoogleMaps,flask-extensionsFlask-GoogleMaps.json +test,data/etesync/server,etesyncserver.json +test,data/flyte/mqtt-io,flytemqtt-io.json 
+test,data/ToucanToco/weaverbird,ToucanTocoweaverbird.json +test,data/Oxygem/Kanmail,OxygemKanmail.json +test,data/wemake-services/django-split-settings,wemake-servicesdjango-split-settings.json +test,data/ioxiocom/openapi-to-fastapi,ioxiocomopenapi-to-fastapi.json +test,data/vanilla-manifesto/vanilla-di-manifesto,vanilla-manifestovanilla-di-manifesto.json +test,data/tableau/hyper-api-samples,tableauhyper-api-samples.json +test,data/GeneralNewsExtractor/GeneralNewsExtractor,GeneralNewsExtractorGeneralNewsExtractor.json +test,data/dry-python/classes,dry-pythonclasses.json +test,data/Bogdanp/web-app-from-scratch,Bogdanpweb-app-from-scratch.json +test,data/dovahcrow/patchify.py,dovahcrowpatchify.py.json +test,data/MrPowerScripts/reddit-karma-farming-bot,MrPowerScriptsreddit-karma-farming-bot.json +test,data/trainindata/testing-and-monitoring-ml-deployments,trainindatatesting-and-monitoring-ml-deployments.json +test,data/luphord/nelson_siegel_svensson,luphordnelson_siegel_svensson.json +test,data/openvinotoolkit/openvino,openvinotoolkitopenvino.json +test,data/vmware/versatile-data-kit,vmwareversatile-data-kit.json +test,data/kedro-org/kedro-viz,kedro-orgkedro-viz.json +test,data/tektoncd/experimental,tektoncdexperimental.json +test,data/aiven/myhoard,aivenmyhoard.json +test,data/QCoDeS/Qcodes,QCoDeSQcodes.json +test,data/oremanj/python-netfilterqueue,oremanjpython-netfilterqueue.json +test,data/ansible/ansible-lint,ansibleansible-lint.json +test,data/sainnhe/dotfiles,sainnhedotfiles.json +test,data/peeringdb/peeringdb-py,peeringdbpeeringdb-py.json +test,data/antismash/antismash,antismashantismash.json +test,data/bosondata/badwolf,bosondatabadwolf.json +test,data/aio-libs/create-aio-app,aio-libscreate-aio-app.json +test,data/fastapi-admin/fastapi-admin,fastapi-adminfastapi-admin.json +test,data/FasterSpeeding/Tanjun,FasterSpeedingTanjun.json +test,data/python-cmd2/cmd2,python-cmd2cmd2.json +test,data/avinassh/py-caskdb,avinasshpy-caskdb.json +test,data/sublimelsp/LSP-pyright,sublimelspLSP-pyright.json +test,data/hail-is/hail,hail-ishail.json +test,data/uds-se/fuzzingbook,uds-sefuzzingbook.json +test,data/arraylabs/pymyq,arraylabspymyq.json +test,data/miki725/importanize,miki725importanize.json +test,data/brentspell/hifi-gan-bwe,brentspellhifi-gan-bwe.json +test,data/GSA/fedramp-automation,GSAfedramp-automation.json +test,data/PrefectHQ/prefect,PrefectHQprefect.json +test,data/vector-of-bool/bpt,vector-of-boolbpt.json +test,data/redzej/graphene-permissions,redzejgraphene-permissions.json +test,data/CharlesBlonde/libsoundtouch,CharlesBlondelibsoundtouch.json +test,data/nuxeo/nuxeo-drive,nuxeonuxeo-drive.json +test,data/CiviWiki/OpenCiviWiki,CiviWikiOpenCiviWiki.json +test,data/csingley/ofxtools,csingleyofxtools.json +test,data/crossbario/autobahn-python,crossbarioautobahn-python.json +test,data/kodemore/kink,kodemorekink.json +test,data/IndustryEssentials/ymir,IndustryEssentialsymir.json +test,data/BrikerMan/Kashgari,BrikerManKashgari.json +test,data/py-pdf/PyPDF2,py-pdfPyPDF2.json +test,data/DontShaveTheYak/cf2tf,DontShaveTheYakcf2tf.json +test,data/glutanimate/anki-addon-builder,glutanimateanki-addon-builder.json +test,data/omnilib/aioitertools,omnilibaioitertools.json +test,data/mjpieters/adventofcode,mjpietersadventofcode.json +test,data/maropu/spark-sql-flow-plugin,maropuspark-sql-flow-plugin.json +test,data/cleder/fastkml,clederfastkml.json +test,data/biolink/kgx,biolinkkgx.json +test,data/mosecorg/mosec,mosecorgmosec.json +test,data/okfde/froide,okfdefroide.json 
+test,data/Netflix/repokid,Netflixrepokid.json +test,data/HackerBCI/EEGwithRaspberryPI,HackerBCIEEGwithRaspberryPI.json +test,data/jsvine/pdfplumber,jsvinepdfplumber.json +test,data/octoml/synr,octomlsynr.json +test,data/audapolis/audapolis,audapolisaudapolis.json +test,data/aws-samples/aws-security-reference-architecture-examples,aws-samplesaws-security-reference-architecture-examples.json +test,data/goat-community/goat,goat-communitygoat.json +test,data/baking-bad/pytezos,baking-badpytezos.json +test,data/CYang828/xbot,CYang828xbot.json +test,data/seanpar203/event-bus,seanpar203event-bus.json +test,data/getsentry/responses,getsentryresponses.json +test,data/cnpryer/huak,cnpryerhuak.json +test,data/DS4SD/deepsearch-toolkit,DS4SDdeepsearch-toolkit.json +test,data/jacebrowning/gitman,jacebrowninggitman.json +test,data/hit9/bitproto,hit9bitproto.json +test,data/jamesob/coldcore,jamesobcoldcore.json +test,data/msys2/msys2-web,msys2msys2-web.json +test,data/solana-labs/solana-program-library,solana-labssolana-program-library.json +test,data/cdump/investments,cdumpinvestments.json +test,data/nteract/bookstore,nteractbookstore.json +test,data/aws-solutions/qnabot-on-aws,aws-solutionsqnabot-on-aws.json +test,data/ckan/ckan,ckanckan.json +test,data/emkademy/jumpcutter,emkademyjumpcutter.json +test,data/SonarSource/sonar-python,SonarSourcesonar-python.json +test,data/tveastman/secateur,tveastmansecateur.json +test,data/humrochagf/revelation,humrochagfrevelation.json +test,data/Ravn-Tech/HyperTag,Ravn-TechHyperTag.json +test,data/eugenesiow/super-image,eugenesiowsuper-image.json +test,data/pathpy/pathpy,pathpypathpy.json +valid,data/shidenggui/easytrader,shidengguieasytrader.json +valid,data/danielfernau/unifi-protect-video-downloader,danielfernauunifi-protect-video-downloader.json +valid,data/typeddjango/django-stubs,typeddjangodjango-stubs.json +valid,data/flashbots/mev-inspect-py,flashbotsmev-inspect-py.json +valid,data/nihp-public/covid19-app-system-public,nihp-publiccovid19-app-system-public.json +valid,data/vyperlang/titanoboa,vyperlangtitanoboa.json +valid,data/PabloLec/RecoverPy,PabloLecRecoverPy.json +valid,data/DamnWidget/anaconda,DamnWidgetanaconda.json +valid,data/NeuroTechX/eeg-notebooks,NeuroTechXeeg-notebooks.json +valid,data/thesadru/genshin.py,thesadrugenshin.py.json +valid,data/hack-different/ipwndfu,hack-differentipwndfu.json +valid,data/JetBrains-Research/code2seq,JetBrains-Researchcode2seq.json +valid,data/crawl/crawl,crawlcrawl.json +valid,data/pyppeteer/pyppeteer,pyppeteerpyppeteer.json +valid,data/wmuron/motpy,wmuronmotpy.json +valid,data/ServiceNow/azimuth,ServiceNowazimuth.json +valid,data/MiniZinc/minizinc-python,MiniZincminizinc-python.json +valid,data/ChrisCrossCrash/r-place-blender,ChrisCrossCrashr-place-blender.json +valid,data/spotDL/spotify-downloader,spotDLspotify-downloader.json +valid,data/StefanUlbrich/design-by-contract,StefanUlbrichdesign-by-contract.json +valid,data/joshtemple/lkml,joshtemplelkml.json +valid,data/alorence/django-modern-rpc,alorencedjango-modern-rpc.json +valid,data/bitpicky/dbt-sugar,bitpickydbt-sugar.json +valid,data/jboynyc/textnets,jboynyctextnets.json +valid,data/rochacbruno/python-project-template,rochacbrunopython-project-template.json +valid,data/reddit/baseplate.py,redditbaseplate.py.json +valid,data/OSOceanAcoustics/echopype,OSOceanAcousticsechopype.json +valid,data/kiwicom/the-zoo,kiwicomthe-zoo.json +valid,data/jerry-git/pytest-split,jerry-gitpytest-split.json +valid,data/sentinel-hub/eo-learn,sentinel-hubeo-learn.json 
+valid,data/celery/celery,celerycelery.json +valid,data/splitgraph/sgr,splitgraphsgr.json +valid,data/Hironsan/asari,Hironsanasari.json +valid,data/ml-tooling/opyrator,ml-toolingopyrator.json +valid,data/matrix-org/synapse,matrix-orgsynapse.json +valid,data/Qiskit/qiskit-nature,Qiskitqiskit-nature.json +valid,data/searxng/searxng,searxngsearxng.json +valid,data/HazyResearch/manifest,HazyResearchmanifest.json +valid,data/hh-h/aiohttp-swagger3,hh-haiohttp-swagger3.json +valid,data/se2p/pynguin,se2ppynguin.json +valid,data/vlasovskikh/funcparserlib,vlasovskikhfuncparserlib.json +valid,data/SourceCode-AI/aura,SourceCode-AIaura.json +valid,data/mitmproxy/pdoc,mitmproxypdoc.json +valid,data/awtkns/fastapi-crudrouter,awtknsfastapi-crudrouter.json +valid,data/ElementsProject/lightning,ElementsProjectlightning.json +valid,data/tr11/python-configuration,tr11python-configuration.json +valid,data/wbawakate/fairtorch,wbawakatefairtorch.json +valid,data/kaisero/fireREST,kaiserofireREST.json +valid,data/nazrulworld/fhir.resources,nazrulworldfhir.resources.json +valid,data/arvvoid/plugin.video.hbogoeu,arvvoidplugin.video.hbogoeu.json +valid,data/epwalsh/nlp-models,epwalshnlp-models.json +valid,data/microsoft/knowledge-extraction-recipes-forms,microsoftknowledge-extraction-recipes-forms.json +valid,data/AstroMatt/book-python,AstroMattbook-python.json +valid,data/aio-libs/aioredis-py,aio-libsaioredis-py.json +valid,data/pymc-devs/pymc4,pymc-devspymc4.json +valid,data/martin-majlis/Wikipedia-API,martin-majlisWikipedia-API.json +valid,data/aisingapore/PeekingDuck,aisingaporePeekingDuck.json +valid,data/microsoft/unilm,microsoftunilm.json +valid,data/ding-lab/CharGer,ding-labCharGer.json +valid,data/tf-encrypted/tf-encrypted,tf-encryptedtf-encrypted.json +valid,data/alecthomas/flask_injector,alecthomasflask_injector.json +valid,data/And3rsL/Deebot-for-Home-Assistant,And3rsLDeebot-for-Home-Assistant.json +valid,data/deezer/spleeter,deezerspleeter.json +valid,data/CamDavidsonPilon/lifelines,CamDavidsonPilonlifelines.json +valid,data/algorand/pyteal,algorandpyteal.json +valid,data/argonne-lcf/balsam,argonne-lcfbalsam.json +valid,data/danobot/entity-controller,danobotentity-controller.json +valid,data/MIC-DKFZ/batchgenerators,MIC-DKFZbatchgenerators.json +valid,data/statelyai/xstate-python,statelyaixstate-python.json +valid,data/allenai/allennlp-as-a-library-example,allenaiallennlp-as-a-library-example.json +valid,data/JanssenProject/jans,JanssenProjectjans.json +valid,data/opendatacube/odc-stac,opendatacubeodc-stac.json +valid,data/Yelp/bravado-core,Yelpbravado-core.json +valid,data/rigetti/quantumflow,rigettiquantumflow.json +valid,data/usds/justice40-tool,usdsjustice40-tool.json +valid,data/aws/amazon-redshift-python-driver,awsamazon-redshift-python-driver.json +valid,data/HathorNetwork/hathor-core,HathorNetworkhathor-core.json +valid,data/jleclanche/python-bna,jleclanchepython-bna.json +valid,data/makism/dyconnmap,makismdyconnmap.json +valid,data/gaogaotiantian/viztracer,gaogaotiantianviztracer.json +valid,data/fsociety-team/fsociety,fsociety-teamfsociety.json +valid,data/gitpython-developers/GitPython,gitpython-developersGitPython.json +valid,data/cuthbertLab/music21,cuthbertLabmusic21.json +valid,data/elarivie/pyReaderWriterLock,elariviepyReaderWriterLock.json +valid,data/AllenCellModeling/aicsimageio,AllenCellModelingaicsimageio.json +valid,data/spacetx/starfish,spacetxstarfish.json +valid,data/ajslater/picopt,ajslaterpicopt.json +valid,data/peeter123/digikey-api,peeter123digikey-api.json 
+valid,data/rednafi/hook-slinger,rednafihook-slinger.json +valid,data/facebookresearch/theseus,facebookresearchtheseus.json +valid,data/typeddjango/djangorestframework-stubs,typeddjangodjangorestframework-stubs.json +valid,data/lmmentel/mendeleev,lmmentelmendeleev.json +valid,data/arrowtype/recursive,arrowtyperecursive.json +valid,data/kmkurn/pytorch-crf,kmkurnpytorch-crf.json +valid,data/nextcloud/appstore,nextcloudappstore.json +valid,data/nats-io/nats.py,nats-ionats.py.json +valid,data/specklesystems/specklepy,specklesystemsspecklepy.json +valid,data/rom1504/clip-retrieval,rom1504clip-retrieval.json +valid,data/Udzu/pudzu,Udzupudzu.json +valid,data/lukesmurray/markdown-anki-decks,lukesmurraymarkdown-anki-decks.json +valid,data/pygame/pygameweb,pygamepygameweb.json +valid,data/apache/spark,apachespark.json +valid,data/tomv564/pyls-mypy,tomv564pyls-mypy.json +valid,data/thorn-oss/perception,thorn-ossperception.json +valid,data/uniswap-python/uniswap-python,uniswap-pythonuniswap-python.json +valid,data/pydantic/pydantic,pydanticpydantic.json +valid,data/ansible/ansible,ansibleansible.json +valid,data/Kludex/fastapi-microservices,Kludexfastapi-microservices.json +valid,data/facebookresearch/fairo,facebookresearchfairo.json +valid,data/sally20921/SinForkGAN-pytorch,sally20921SinForkGAN-pytorch.json +valid,data/rmariano/Clean-code-in-Python,rmarianoClean-code-in-Python.json +valid,data/mozilla/foundation.mozilla.org,mozillafoundation.mozilla.org.json +valid,data/pandas-dev/pandas,pandas-devpandas.json +valid,data/facebookincubator/ptr,facebookincubatorptr.json +valid,data/Netflix/consoleme,Netflixconsoleme.json +valid,data/AcademySoftwareFoundation/aswf-docker,AcademySoftwareFoundationaswf-docker.json +valid,data/ninoseki/shodan-dojo,ninosekishodan-dojo.json +valid,data/replicate/keepsake,replicatekeepsake.json +valid,data/reorx/python-terminal-color,reorxpython-terminal-color.json +valid,data/python-trio/trio,python-triotrio.json +valid,data/repology/repology-webapp,repologyrepology-webapp.json +valid,data/xoolive/traffic,xoolivetraffic.json +valid,data/kyclark/tiny_python_projects,kyclarktiny_python_projects.json +valid,data/belambert/edit-distance,belambertedit-distance.json +valid,data/jordaneremieff/mangum,jordaneremieffmangum.json +valid,data/AcademySoftwareFoundation/OpenColorIO-Config-ACES,AcademySoftwareFoundationOpenColorIO-Config-ACES.json +valid,data/networktocode/ntc-rosetta,networktocodentc-rosetta.json +valid,data/asappresearch/flambe,asappresearchflambe.json +valid,data/sscpac/statick,sscpacstatick.json +valid,data/tencentmusic/cube-studio,tencentmusiccube-studio.json +valid,data/SUNET/cnaas-nms,SUNETcnaas-nms.json +valid,data/sjev/trading-with-python,sjevtrading-with-python.json +valid,data/PyLops/pylops,PyLopspylops.json +valid,data/microsoft/vscode-python,microsoftvscode-python.json +valid,data/exentriquesolutions/nip.io,exentriquesolutionsnip.io.json +valid,data/quodlibet/quodlibet,quodlibetquodlibet.json +valid,data/fedora-infra/anitya,fedora-infraanitya.json +valid,data/slimovich/Realworld-fastapi-gino-template,slimovichRealworld-fastapi-gino-template.json +valid,data/tmux-python/libtmux,tmux-pythonlibtmux.json +valid,data/osiweb/unified_retro_keyboard,osiwebunified_retro_keyboard.json +valid,data/ssokolow/quicktile,ssokolowquicktile.json +valid,data/HoloArchivists/twspace-dl,HoloArchiviststwspace-dl.json +valid,data/astronomer/astro-sdk,astronomerastro-sdk.json +valid,data/adekmaulana/aioaria2-mirror-bot,adekmaulanaaioaria2-mirror-bot.json 
+valid,data/rstcheck/rstcheck,rstcheckrstcheck.json +valid,data/Finistere/antidote,Finistereantidote.json +valid,data/rom1504/img2dataset,rom1504img2dataset.json +valid,data/ActivityWatch/aw-watcher-window,ActivityWatchaw-watcher-window.json +valid,data/winpython/winpython,winpythonwinpython.json +valid,data/dedupeio/dedupe,dedupeiodedupe.json +valid,data/vangorra/python_withings_api,vangorrapython_withings_api.json +valid,data/pymedphys/pymedphys,pymedphyspymedphys.json +valid,data/dsmrreader/dsmr-reader,dsmrreaderdsmr-reader.json +valid,data/FFRI/ProjectChampollion,FFRIProjectChampollion.json +valid,data/facebook/TestSlide,facebookTestSlide.json +valid,data/jodal/comics,jodalcomics.json +valid,data/your-tools/tsrc,your-toolstsrc.json +valid,data/PostHog/posthog,PostHogposthog.json +valid,data/openai/kubernetes-ec2-autoscaler,openaikubernetes-ec2-autoscaler.json +valid,data/goauthentik/authentik,goauthentikauthentik.json +valid,data/chmouel/gnome-next-meeting-applet,chmouelgnome-next-meeting-applet.json +valid,data/unchained-capital/hermit,unchained-capitalhermit.json +valid,data/plone/guillotina,ploneguillotina.json +valid,data/cherrypy/cheroot,cherrypycheroot.json +valid,data/piccolo-orm/piccolo_api,piccolo-ormpiccolo_api.json +valid,data/materialsvirtuallab/mlearn,materialsvirtuallabmlearn.json +valid,data/runtimeverification/evm-semantics,runtimeverificationevm-semantics.json +valid,data/long2ice/asyncmy,long2iceasyncmy.json +valid,data/GaloisInc/cryptol,GaloisInccryptol.json +valid,data/navis-org/navis,navis-orgnavis.json +valid,data/mintel/pytest-localstack,mintelpytest-localstack.json +valid,data/luizalabs/shared-memory-dict,luizalabsshared-memory-dict.json +valid,data/tophat/syrupy,tophatsyrupy.json +valid,data/widdowquinn/pyani,widdowquinnpyani.json +valid,data/sobjornstad/TiddlyRemember,sobjornstadTiddlyRemember.json +valid,data/NVIDIA-Merlin/Transformers4Rec,NVIDIA-MerlinTransformers4Rec.json +valid,data/stlehmann/Flask-MQTT,stlehmannFlask-MQTT.json +valid,data/quic/aimet-model-zoo,quicaimet-model-zoo.json +valid,data/dfinity/ic,dfinityic.json +valid,data/koxudaxi/datamodel-code-generator,koxudaxidatamodel-code-generator.json +valid,data/ceph/ceph,cephceph.json +valid,data/sumeshi/evtx2es,sumeshievtx2es.json +valid,data/woefe/ytcc,woefeytcc.json +valid,data/shibuiwilliam/ml-system-in-actions,shibuiwilliamml-system-in-actions.json +valid,data/siddhantgoel/streaming-form-data,siddhantgoelstreaming-form-data.json +valid,data/voxpupuli/pypuppetdb,voxpupulipypuppetdb.json +valid,data/Project-MONAI/MONAILabel,Project-MONAIMONAILabel.json +valid,data/MaterializeInc/materialize,MaterializeIncmaterialize.json +valid,data/DGEXSolutions/osrd,DGEXSolutionsosrd.json +valid,data/python-telegram-bot/ptbcontrib,python-telegram-botptbcontrib.json +valid,data/epwalsh/pytorch-crf,epwalshpytorch-crf.json +valid,data/cantools/cantools,cantoolscantools.json +valid,data/Mini-Conf/Mini-Conf,Mini-ConfMini-Conf.json +valid,data/drivendataorg/erdantic,drivendataorgerdantic.json +valid,data/aiven/aiven-client,aivenaiven-client.json +valid,data/OpenMined/SyMPC,OpenMinedSyMPC.json +valid,data/Xilinx/qemu,Xilinxqemu.json +valid,data/nasaharvest/cropharvest,nasaharvestcropharvest.json +valid,data/mozilla/gecko-dev,mozillagecko-dev.json +valid,data/silviogutierrez/reactivated,silviogutierrezreactivated.json +valid,data/apryor6/fastapi_example,apryor6fastapi_example.json +valid,data/pymc-devs/pymc,pymc-devspymc.json +valid,data/sqlfluff/sqlfluff,sqlfluffsqlfluff.json 
+valid,data/caraml-dev/turing,caraml-devturing.json +valid,data/PacktPublishing/Clean-Code-in-Python,PacktPublishingClean-Code-in-Python.json +valid,data/gaphor/gaphor,gaphorgaphor.json +valid,data/csingley/ibflex,csingleyibflex.json +valid,data/mrbeardad/SpaceVim,mrbeardadSpaceVim.json +valid,data/backube/volsync,backubevolsync.json +valid,data/pytube/pytube,pytubepytube.json +valid,data/jitsejan/python-flask-with-javascript,jitsejanpython-flask-with-javascript.json +valid,data/ninoseki/eml_analyzer,ninosekieml_analyzer.json +valid,data/IBCNServices/pyRDF2Vec,IBCNServicespyRDF2Vec.json +valid,data/tkarabela/pysubs2,tkarabelapysubs2.json +valid,data/preset-io/elasticsearch-dbapi,preset-ioelasticsearch-dbapi.json +valid,data/dapr/python-sdk,daprpython-sdk.json +valid,data/NVIDIA-Merlin/models,NVIDIA-Merlinmodels.json +valid,data/qutebrowser/qutebrowser,qutebrowserqutebrowser.json +valid,data/nccgroup/ScoutSuite,nccgroupScoutSuite.json +valid,data/executablebooks/mdformat,executablebooksmdformat.json +valid,data/lowRISC/opentitan,lowRISCopentitan.json +valid,data/de-code/python-tf-bodypix,de-codepython-tf-bodypix.json diff --git a/type4py/__main__.py b/type4py/__main__.py index 475a8de..b1f74fe 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -10,51 +10,60 @@ warnings.filterwarnings("ignore") data_loading_comb = {'train': data_loaders.load_combined_train_data, 'valid': data_loaders.load_combined_valid_data, - 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels, + 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels, 'name': 'complete'} -data_loading_comb_sep = {'train': data_loaders.load_combined_train_data_split, 'valid': data_loaders.load_combined_valid_data_split, - 'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels_split, - 'name': 'complete'} +data_loading_comb_sep = {'train': data_loaders.load_combined_train_data_split, + 'valid': data_loaders.load_combined_valid_data_split, + 'test': data_loaders.load_combined_test_data, + 'labels': data_loaders.load_combined_labels_split, + 'name': 'complete'} -data_loading_woi = {'train': data_loaders.load_combined_train_data_woi, 'valid': data_loaders.load_combined_valid_data_woi, - 'test': data_loaders.load_combined_test_data_woi, 'labels': data_loaders.load_combined_labels, - 'name': 'woi'} +data_loading_woi = {'train': data_loaders.load_combined_train_data_woi, + 'valid': data_loaders.load_combined_valid_data_woi, + 'test': data_loaders.load_combined_test_data_woi, 'labels': data_loaders.load_combined_labels, + 'name': 'woi'} -data_loading_woc = {'train': data_loaders.load_combined_train_data_woc, 'valid': data_loaders.load_combined_valid_data_woc, - 'test': data_loaders.load_combined_test_data_woc, 'labels': data_loaders.load_combined_labels, - 'name': 'woc'} +data_loading_woc = {'train': data_loaders.load_combined_train_data_woc, + 'valid': data_loaders.load_combined_valid_data_woc, + 'test': data_loaders.load_combined_test_data_woc, 'labels': data_loaders.load_combined_labels, + 'name': 'woc'} -data_loading_wov = {'train': data_loaders.load_combined_train_data_wov, 'valid': data_loaders.load_combined_valid_data_wov, - 'test': data_loaders.load_combined_test_data_wov, 'labels': data_loaders.load_combined_labels, - 'name': 'wov'} +data_loading_wov = {'train': data_loaders.load_combined_train_data_wov, + 'valid': data_loaders.load_combined_valid_data_wov, + 'test': data_loaders.load_combined_test_data_wov, 'labels': 
data_loaders.load_combined_labels, + 'name': 'wov'} data_loading_param = {'train': data_loaders.load_param_train_data, 'valid': data_loaders.load_param_valid_data, - 'test': data_loaders.load_param_test_data, 'labels': data_loaders.load_param_labels, - 'name': 'argument'} + 'test': data_loaders.load_param_test_data, 'labels': data_loaders.load_param_labels, + 'name': 'argument'} data_loading_ret = {'train': data_loaders.load_ret_train_data, 'valid': data_loaders.load_ret_valid_data, - 'test': data_loaders.load_ret_test_data, 'labels': data_loaders.load_ret_labels, - 'name': 'return'} + 'test': data_loaders.load_ret_test_data, 'labels': data_loaders.load_ret_labels, + 'name': 'return'} data_loading_var = {'train': data_loaders.load_var_train_data, 'valid': data_loaders.load_var_valid_data, - 'test': data_loaders.load_var_test_data, 'labels': data_loaders.load_var_labels, - 'name': 'variable'} + 'test': data_loaders.load_var_test_data, 'labels': data_loaders.load_var_labels, + 'name': 'variable'} + def extract(args): p = Pipeline(args.c, args.o, True, False, args.d) p.run(find_repos_list(args.c) if args.l is None else find_repos_list(args.c)[:args.l], args.w) - + + def preprocess(args): from type4py.preprocess import preprocess_ext_fns setup_logs_file(args.o, "preprocess") preprocess_ext_fns(args.o, args.l, args.rvth) + def vectorize(args): from type4py.vectorize import vectorize_args_ret setup_logs_file(args.o, "vectorize") vectorize_args_ret(args.o) + def learn(args): from type4py.learn import train setup_logs_file(args.o, "learn") @@ -67,12 +76,14 @@ def learn(args): else: train(args.o, data_loading_comb, args.p, args.v) + def learn_split(args): from type4py.learn_split import train_split setup_logs_file(args.o, "learn_sep") if args.c: train_split(args.o, data_loading_comb_sep, args.dt, args.p, args.v) + def predict(args): from type4py.predict import test setup_logs_file(args.o, "predict") @@ -85,18 +96,20 @@ def predict(args): elif args.c: test(args.o, data_loading_comb, args.l, args.rtc) + def gen_cluster(args): from type4py.gen_cluster import gen_cluster setup_logs_file(args.o, "gen_clusters") gen_cluster(args.o, data_loading_comb_sep, args.dt) + def eval(args): from type4py.eval import evaluate setup_logs_file(args.o, "eval") tasks = {'c': {'Parameter', 'Return', 'Variable'}, 'p': {'Parameter'}, 'r': {'Return'}, 'v': {'Variable'}} if args.woi: - evaluate(args.o, data_loading_woi['name'], tasks[args.t] , args.tp, args.mrr) + evaluate(args.o, data_loading_woi['name'], tasks[args.t], args.tp, args.mrr) elif args.woc: evaluate(args.o, data_loading_woc['name'], tasks[args.t], args.tp, args.mrr) elif args.wov: @@ -104,12 +117,19 @@ def eval(args): else: evaluate(args.o, data_loading_comb['name'], tasks[args.t], args.tp, args.mrr) + def infer(args): from type4py.deploy.infer import infer_main setup_logs_file(args.m, 'infer') infer_main(args.m, args.f) +def infer_project(args): + from type4py.deploy.infer_project import infer_project_main + setup_logs_file(args.m, 'infer_project') + infer_project_main(args.m, args.p, args.o, args.split) + + def main(): arg_parser = argparse.ArgumentParser() sub_parsers = arg_parser.add_subparsers(dest='cmd') @@ -119,15 +139,19 @@ def main(): extract_parser.add_argument('--c', '--corpus', required=True, type=str, help="Path to the Python corpus or dataset") extract_parser.add_argument('--o', '--output', required=True, type=str, help="Path to store processed projects") extract_parser.add_argument('--d', '--deduplicate', required=False, type=str, help="Path to 
duplicate files") - extract_parser.add_argument('--w', '--workers', required=False, default=4, type=int, help="Number of workers to extract functions from the input corpus") - extract_parser.add_argument('--l', '--limit', required=False, type=int, help="Limits the number of projects to be processed") + extract_parser.add_argument('--w', '--workers', required=False, default=4, type=int, + help="Number of workers to extract functions from the input corpus") + extract_parser.add_argument('--l', '--limit', required=False, type=int, + help="Limits the number of projects to be processed") extract_parser.set_defaults(func=extract) # Preprocess phase preprocess_parser = sub_parsers.add_parser('preprocess') preprocess_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - preprocess_parser.add_argument('--l', '--limit', required=False, type=int, help="Limits the number of projects to be processed") - preprocess_parser.add_argument('--rvth', '--random-vth', default=False, action="store_true", help="Apply available type hints with a probability [Default=0.5] *ONLY FOR PRODUCTION*") + preprocess_parser.add_argument('--l', '--limit', required=False, type=int, + help="Limits the number of projects to be processed") + preprocess_parser.add_argument('--rvth', '--random-vth', default=False, action="store_true", + help="Apply available type hints with a probability [Default=0.5] *ONLY FOR PRODUCTION*") preprocess_parser.set_defaults(func=preprocess) # Vectorize phase @@ -141,32 +165,39 @@ def main(): learning_parser.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") learning_parser.add_argument('--woi', default=False, action="store_true", help="Type4py model w/o identifiers") learning_parser.add_argument('--woc', default=False, action="store_true", help="Type4py model w/o code contexts") - learning_parser.add_argument('--wov', default=False, action="store_true", help="Type4py model w/o visible type hints") - learning_parser.add_argument('--p', '--parameters', required=False, type=str, help="Path to the JSON file of model's hyper-parameters") - learning_parser.add_argument('--v', '--validation', default=False, action="store_true", help="Evaluating Type4Py on the validation set when training") + learning_parser.add_argument('--wov', default=False, action="store_true", + help="Type4py model w/o visible type hints") + learning_parser.add_argument('--p', '--parameters', required=False, type=str, + help="Path to the JSON file of model's hyper-parameters") + learning_parser.add_argument('--v', '--validation', default=False, action="store_true", + help="Evaluating Type4Py on the validation set when training") learning_parser.set_defaults(func=learn) # Learning phase split learning_parser_sep = sub_parsers.add_parser('learn_sep') learning_parser_sep.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - learning_parser_sep.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") + learning_parser_sep.add_argument('--c', '--complete', default=True, action="store_true", + help="Complete Type4Py model") learning_parser_sep.add_argument('--dt', '--datatype', required=True, type=str, help="Datatype for training phase") learning_parser_sep.add_argument('--p', '--parameters', required=False, type=str, - help="Path to the JSON file of model's hyper-parameters") + help="Path to the JSON file of model's hyper-parameters") 
learning_parser_sep.add_argument('--v', '--validation', default=False, action="store_true", - help="Evaluating Type4Py on the validation set when training") + help="Evaluating Type4Py on the validation set when training") learning_parser_sep.set_defaults(func=learn_split) # Prediction phase predict_parser = sub_parsers.add_parser('predict') predict_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") predict_parser.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") - predict_parser.add_argument('--l', '--limit', required=False, type=int, help="Limiting the size of type vocabulary when building type clusters") - predict_parser.add_argument('--rtc', '--reduced-tc', default=False, action="store_true", help="Use reduced type clusters") + predict_parser.add_argument('--l', '--limit', required=False, type=int, + help="Limiting the size of type vocabulary when building type clusters") + predict_parser.add_argument('--rtc', '--reduced-tc', default=False, action="store_true", + help="Use reduced type clusters") predict_parser.add_argument('--woi', default=False, action="store_true", help="Type4py model w/o identifiers") predict_parser.add_argument('--woc', default=False, action="store_true", help="Type4py model w/o code contexts") - predict_parser.add_argument('--wov', default=False, action="store_true", help="Type4py model w/o visible type hints") + predict_parser.add_argument('--wov', default=False, action="store_true", + help="Type4py model w/o visible type hints") predict_parser.set_defaults(func=predict) # gen type cluster incremental: predict phase @@ -175,13 +206,14 @@ def main(): predict_parser.add_argument('--dt', '--datatype', required=True, help="Datatype for generating type clusters") predict_parser.set_defaults(func=gen_cluster) - # Evaluation phase eval_parser = sub_parsers.add_parser('eval') eval_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - eval_parser.add_argument('--t', '--task', default="c", type=str, help="Prediction tasks (combined -> c |parameters -> p| return -> r| variable -> v)") + eval_parser.add_argument('--t', '--task', default="c", type=str, + help="Prediction tasks (combined -> c |parameters -> p| return -> r| variable -> v)") eval_parser.add_argument('--tp', '--topn', default=10, type=int, help="Report top-n predictions [default n=10]") - eval_parser.add_argument('--mrr', default=False, action="store_true", help="Calculates MRR for all considered metrics") + eval_parser.add_argument('--mrr', default=False, action="store_true", + help="Calculates MRR for all considered metrics") eval_parser.add_argument('--woi', default=False, action="store_true", help="Type4py model w/o identifiers") eval_parser.add_argument('--woc', default=False, action="store_true", help="Type4py model w/o code contexts") eval_parser.add_argument('--wov', default=False, action="store_true", help="Type4py model w/o visible type hints") @@ -194,9 +226,23 @@ def main(): # Inference infer_parser = sub_parsers.add_parser('infer') infer_parser.add_argument('--m', '--model', required=True, type=str, help="Path to the pre-trained Type4Py model") - infer_parser.add_argument('--f', '--file', required=True, type=str, help="Path to the input source file for inference") + infer_parser.add_argument('--f', '--file', required=True, type=str, + help="Path to the input source file for inference") infer_parser.set_defaults(func=infer) + # Inference project_base + 
infer_parser_pro = sub_parsers.add_parser('infer_project') + infer_parser_pro.add_argument('--m', '--model', required=True, type=str, + help="Path to the pre-trained Type4Py model") + infer_parser_pro.add_argument('--p', '--path', required=True, type=str, + help="Path to python projects folder for inference") + infer_parser_pro.add_argument('--o', '--output', required=True, type=str, + help="Path to store the ml_infer outputs") + # split according to dataset_split_repo.csv + infer_parser_pro.add_argument('--split', '--split_file', required=True, type=str, + help="file to store the split of projects") + infer_parser_pro.set_defaults(func=infer_project) + # To ONNX format onnx_parser = sub_parsers.add_parser('to_onnx') onnx_parser.add_argument("--o", required=True, type=str, help="Path to processed projects") @@ -212,5 +258,6 @@ def main(): args = arg_parser.parse_args() args.func(args) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/type4py/deploy/infer.py b/type4py/deploy/infer.py index 3b2aa96..89e2e44 100644 --- a/type4py/deploy/infer.py +++ b/type4py/deploy/infer.py @@ -249,9 +249,9 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]: preds = infer_single_dp(pre_trained_m.type_clusters_idx, pre_trained_m.type4py_model_params['k'], pre_trained_m.type_clusters_labels, te) if filter_pred_types: - type_embeds_preds.append(filter_preds(list(zip(list(pre_trained_m.label_enc.inverse_transform([p for p,s in preds])), [s for p,s in preds])))) + type_embeds_preds.append(filter_preds(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds])))) else: - type_embeds_preds.append(list(zip(list(pre_trained_m.label_enc.inverse_transform([p for p,s in preds])), [s for p,s in preds]))) + type_embeds_preds.append(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds]))) return type_embeds_preds diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py new file mode 100644 index 0000000..e3a5156 --- /dev/null +++ b/type4py/deploy/infer_project.py @@ -0,0 +1,89 @@ +import os +from typing import List +import pandas as pd +import tqdm +import json + +from type4py.deploy.infer import PretrainedType4Py, type_annotate_file +from type4py import logger +from libsa4py.exceptions import ParseError + +from libsa4py.utils import list_files, find_repos_list +from pathlib import Path + +def find_test_list(project_dir, dataset_split): + if os.path.exists(dataset_split): + repos_list: List[dict] = [] + + df = pd.read_csv(dataset_split) + test_df = df[df['set'] == 'test'] + for index, row in test_df.iterrows(): + project = row['project'] + author = project.split('/')[1] + repo = project.split('/')[2] + project_path = os.path.join(project_dir, author, repo) + if os.path.isdir(project_path): + repos_list.append({"author": author, "repo": repo}) + return repos_list + + else: + # logger.info(f"dataset_split file: {dataset_split} does not exist!") + raise FileNotFoundError(f"dataset_split file: {dataset_split} does not exist!") + +def infer(repo, model, project_dir, tar_dir): + project_author = repo["author"] + project_name = repo["repo"] + project_path = os.path.join(project_dir, project_author, project_name) + id_tuple = (project_author, project_name) + project_id = "/".join(id_tuple) + project_analyzed_files: dict = {project_id: {"src_files": {}, "type_annot_cove": 0.0}} + print(f'Running pipeline for project {project_path}') + + 
print(f'Extracting for {project_path}...')
+    project_files = list_files(project_path)
+    print(f"{project_path} has {len(project_files)} files")
+
+    project_files = [(f, str(Path(f).relative_to(Path(project_path).parent))) for f in project_files]
+
+    if len(project_files) != 0:
+        for filename, f_relative in project_files:
+            try:
+                ext_type_hints = type_annotate_file(model, None, filename)
+                project_analyzed_files[project_id]["src_files"][filename] = \
+                    ext_type_hints
+            except ParseError as err:
+                print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
+            except UnicodeDecodeError:
+                print(f"Could not read file {filename}")
+            except Exception as err:
+                print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
+
+    if len(project_analyzed_files[project_id]["src_files"].keys()) != 0:
+        project_analyzed_files[project_id]["type_annot_cove"] = \
+            round(sum([project_analyzed_files[project_id]["src_files"][s]["type_annot_cove"] for s in
+                       project_analyzed_files[project_id]["src_files"].keys()]) / len(
+                project_analyzed_files[project_id]["src_files"].keys()), 2)
+
+    processed_file = os.path.join(tar_dir, f"{project_author}{project_name}_mlInfer.json")
+    with open(processed_file, 'w') as json_f:
+        json.dump(project_analyzed_files, json_f, indent=4)
+
+
+def infer_projects(model, project_dir, tar_dir, split_file):
+    if split_file is not None:
+        repo_infos_test = find_test_list(project_dir, split_file)
+        logger.info(f'Found {len(repo_infos_test)} projects in the test set')
+    else:
+        logger.info(f"dataset_split file not provided; inferring all projects in {project_dir}")
+        repo_infos_test = find_repos_list(project_dir)
+        logger.info(f'Found {len(repo_infos_test)} projects in the project directory')
+
+    for repo in tqdm.tqdm(repo_infos_test):
+        infer(repo, model, project_dir, tar_dir)
+
+
+def infer_project_main(model_path, input_path, output_path, split_file):
+    t4py_pretrained_m = PretrainedType4Py(model_path, "gpu", pre_read_type_cluster=False, use_pca=True)
+    t4py_pretrained_m.load_pretrained_model()
+
+    infer_projects(t4py_pretrained_m, input_path, output_path, split_file)

From 4cc376f0777d2807728aed0363149959fcf3f4f6 Mon Sep 17 00:00:00 2001
From: fenglang
Date: Tue, 28 Mar 2023 09:54:06 +0200
Subject: [PATCH 05/43] add more comments
---
 type4py/__main__.py     |  4 ++++
 type4py/data_loaders.py |  6 +++---
 type4py/gen_cluster.py  |  3 +++
 type4py/learn_split.py  | 30 ++++++++++++++++++++++++------
 type4py/reduce.py       |  1 +
 type4py/vectorize.py    |  3 +++
 6 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/type4py/__main__.py b/type4py/__main__.py
index b1f74fe..ae6b38c 100644
--- a/type4py/__main__.py
+++ b/type4py/__main__.py
@@ -13,6 +13,7 @@
                      'test': data_loaders.load_combined_test_data, 'labels': data_loaders.load_combined_labels,
                      'name': 'complete'}

+# add split data loading function
 data_loading_comb_sep = {'train': data_loaders.load_combined_train_data_split,
                          'valid': data_loaders.load_combined_valid_data_split,
                          'test': data_loaders.load_combined_test_data,
                          'labels': data_loaders.load_combined_labels_split,
@@ -77,6 +78,7 @@ def learn(args):
         train(args.o, data_loading_comb, args.p, args.v)

+# add learn_split function for CLI command "learn_split"
 def learn_split(args):
     from type4py.learn_split import train_split
     setup_logs_file(args.o, "learn_sep")
     if args.c:
         train_split(args.o, data_loading_comb_sep, args.dt, args.p, args.v)
@@ -97,6 +99,7 @@ def predict(args):
         test(args.o, data_loading_comb, args.l, args.rtc)

+# add gen_cluster function for CLI command "gen_clu"
 def gen_cluster(args):
     from type4py.gen_cluster import gen_cluster
     setup_logs_file(args.o, "gen_clusters")
     gen_cluster(args.o, data_loading_comb_sep, args.dt)
@@ -124,6 +127,7 @@ def infer(args):
     infer_main(args.m, args.f)

+# add project-based infer function for command "infer_project"
 def infer_project(args):
     from type4py.deploy.infer_project import infer_project_main
     setup_logs_file(args.m, 'infer_project')
diff --git a/type4py/data_loaders.py b/type4py/data_loaders.py
index 57bce64..94777cb 100644
--- a/type4py/data_loaders.py
+++ b/type4py/data_loaders.py
@@ -36,7 +36,7 @@ def load_combined_train_data(output_path: str):
                load_data_tensors_TW(join(output_path, 'vectors', 'train', 'ret_train_aval_types_dp.npy')),
                load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_aval_types_dp.npy'))))
-
+# load combined train dataset based on datatype
 def load_combined_train_data_split(output_path: str, type: str):
     if type == "var":
         logger.info("Loading Variable set...")
@@ -70,7 +70,7 @@ def load_combined_valid_data(output_path: str):
                load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'ret_valid_aval_types_dp.npy')),
                load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'var_valid_aval_types_dp.npy'))))
-
+# load combined valid dataset based on datatype
 def load_combined_valid_data_split(output_path: str, type: str):
     if type == "var":
         return load_data_tensors_TW(join(output_path, 'vectors', 'valid', 'identifiers_var_valid_datapoints_x.npy')), \
@@ -115,7 +115,7 @@ def load_combined_labels(output_path: str):
                load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'ret_test_dps_y_all.npy')),
                load_flat_labels_tensors(join(output_path, 'vectors', 'test', 'var_test_dps_y_all.npy'))))
-
+# load combined labels based on datatype
 def load_combined_labels_split(output_path: str, type: str):
     if type == "var":
         return load_data_tensors_TW(join(output_path, 'vectors', 'train', 'var_train_dps_y_all.npy')), \
diff --git a/type4py/gen_cluster.py b/type4py/gen_cluster.py
index 64d863b..9acfa7d 100644
--- a/type4py/gen_cluster.py
+++ b/type4py/gen_cluster.py
@@ -269,6 +269,8 @@ def gen_cluster(output_path: str, data_loading_funcs: dict, datatype: str, type_
     if datatype not in {"var", "param", "ret"}:
         raise DataTypeNotExistError(f"datatype input {datatype} not in [ var, param, ret] list")

+    # check for an existing AnnoyIndex and embed_labels before generating new ones
+    # check and load the existing AnnoyIndex
     logger.info("Checking the existing AnnoyIndex...")
     cluster_file, processed_type = find_existing_index(data_loading_funcs, output_path)
@@ -292,6 +294,7 @@ def gen_cluster(output_path: str, data_loading_funcs: dict, datatype: str, type_
                                                          model_params['batches'])
     logger.info(f"Train and Valid data loaded")

+    # generate a new annoy_index and embed_labels
     annoy_index, embed_labels = build_type_clusters(model.model, output_path, train_data_loader, valid_data_loader,
                                                     type_vocab, cluster_file, embedded_file)
     logger.info("Created type clusters")
diff --git a/type4py/learn_split.py b/type4py/learn_split.py
index 2abbdee..1d3bc83 100644
--- a/type4py/learn_split.py
+++ b/type4py/learn_split.py
@@ -23,20 +23,25 @@
 logger.name = __name__
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+
 class ModelNotFit(Exception):
     pass

+
 class NotCompleteModel(ModelNotFit):
     def __init__(self):
         super().__init__("learn_split may just fit for complete model!")

+
 class TrainedModel(Exception):
     pass

+
 class ModelTrainedError(TrainedModel):
     def __init__(self):
         super().__init__("Model has been trained for this dataset!")

+
 class Type4Py(nn.Module):
     """
     Complete model
     """
@@ -212,6 +217,7 @@ def compute_validation_loss_dsl(model: TripletModel, criterion, train_valid_load
     return valid_total_loss, 0.0

+
 def check_pickle_file(type, data_loading_funcs, output_path):
     var_exist = False
     param_exist = False
@@ -227,6 +233,8 @@
         logger.info(f"find existing {data_loading_funcs['name']}_common_types_ret.pkl file !")
     return var_exist, param_exist, ret_exist

+
+# find an existing trained model and return its trained_types
 def find_existing_model(data_loading_funcs, output_path):
     prefix = f"type4py_{data_loading_funcs['name']}_model"
     suffix = ".pt"
@@ -238,22 +246,27 @@
             return filename, trained
     return None, None

-def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, model_params_path=None, validation: bool = False):
+
+def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, model_params_path=None,
+                validation: bool = False):
     logger.info(f"Training Type4Py model")
     logger.info(f"***********************************************************************")

     # Model's hyper parameters
     model_params = load_model_params(model_params_path)
+
+    # data loading process based on datatype
     data_type_list = ["var", "param", "ret"]
     if dataset_type not in data_type_list:
         raise ValueError(f"{dataset_type} is not in the default data type list!")

-    train_data_loader, valid_data_loader = load_training_data_per_model_sep(data_loading_funcs, output_path,dataset_type,
-                                                                            model_params['batches'],
-                                                                            load_valid_data=validation,
-                                                                            no_workers=cpu_count() // 2)
+    train_data_loader, valid_data_loader = load_training_data_per_model_sep(data_loading_funcs, output_path,
+                                                                            dataset_type,
+                                                                            model_params['batches'],
+                                                                            load_valid_data=validation,
+                                                                            no_workers=cpu_count() // 2)

-    # Loading label encoder and finding ubiquitous & common types
+    # Load the label encoder and check for existing count_types files
     le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb'))

     count_types = Counter(train_data_loader.dataset.labels.data.numpy())
@@ -267,18 +280,22 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str,
     type_filename = dataset_type

+    # if existing types from the "var" dataset are found, load them to update the final common types
     if var_exists and dataset_type != "var":
         with open(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl"), 'rb') as f1:
             count_types_var = pickle.load(f1)
         count_types.update(count_types_var)
+        # also add a suffix to the filename
         type_filename = type_filename + "_var"

+    # if existing types from the "param" dataset are found, load them to update the final common types
     if param_exits and dataset_type != "param":
         with open(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl"), 'rb') as f2:
             count_types_param = pickle.load(f2)
         count_types.update(count_types_param)
         type_filename = type_filename + "_param"

+    # if existing types from the "ret" dataset are found, load them to update the final common types
     if ret_exists and dataset_type != "ret":
         with open(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl"), 'rb') as f3:
             count_types_ret = pickle.load(f3)
@@ -301,6 +318,7 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str,
     with open(join(output_path, f"{data_loading_funcs['name']}_common_types_{type_filename}.pkl"), 'wb') as f:
         pickle.dump(common_types, f)

+    # get the trained model name and its trained_types
     trained_model_name, trained_types = find_existing_model(data_loading_funcs, output_path)

     if trained_types == None:
diff --git a/type4py/reduce.py
b/type4py/reduce.py index 32ac682..91acca9 100644 --- a/type4py/reduce.py +++ b/type4py/reduce.py @@ -24,6 +24,7 @@ def __init__(self): def reduce_tc(args): model_params = load_model_params() type_cluster_index = AnnoyIndex(model_params['output_size'], 'euclidean') + # check if there is existing type_cluster if os.path.exists(join(args.o, "type4py_complete_type_cluster")): logger.info("Loading type clusters: type4py_complete_type_cluster") type_cluster_index.load(join(args.o, "type4py_complete_type_cluster")) diff --git a/type4py/vectorize.py b/type4py/vectorize.py index 7e0a496..e4dd872 100644 --- a/type4py/vectorize.py +++ b/type4py/vectorize.py @@ -232,6 +232,9 @@ def process_datapoints(df, output_path, embedding_type, type, trans_func, cached datapoints = df.apply(trans_func, axis=1) # optimize np.stack for datapoints in batches when handling large datasets + # emd_shape is based on the "identifiers" and "tokens" + + # define batch sizes and rows for datapoints_x batch_size = 1000 num_rows = datapoints.shape[0] From 084aaa2f9f2a88374b8eab84d6ffc3f32ac9b24c Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 6 Apr 2023 18:17:15 +0200 Subject: [PATCH 06/43] fix the issues --- type4py/__init__.py | 1 + type4py/__main__.py | 30 ++- type4py/exceptions.py | 37 ++++ .../{gen_cluster.py => gen_type_cluster.py} | 158 +------------- type4py/learn_split.py | 198 +----------------- type4py/predict_split.py | 70 +++++++ type4py/reduce.py | 7 +- type4py/vectorize.py | 11 +- 8 files changed, 145 insertions(+), 367 deletions(-) create mode 100644 type4py/exceptions.py rename type4py/{gen_cluster.py => gen_type_cluster.py} (57%) create mode 100644 type4py/predict_split.py diff --git a/type4py/__init__.py b/type4py/__init__.py index 9f89ddf..6fbb08c 100644 --- a/type4py/__init__.py +++ b/type4py/__init__.py @@ -17,3 +17,4 @@ MAX_PARAM_TYPE_DEPTH = 2 TOKEN_SEQ_LEN = (7, 3) AVAILABLE_TYPE_APPLY_PROB = 0.5 +IDENTIFIER_SEQ_LEN = 31 \ No newline at end of file diff --git a/type4py/__main__.py b/type4py/__main__.py index ae6b38c..37c08e5 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -100,10 +100,16 @@ def predict(args): # add gen_cluster function for CLI command "gen_clu" -def gen_cluster(args): - from type4py.gen_cluster import gen_cluster +def gen_type_cluster(args): + from type4py.gen_type_cluster import gen_type_cluster setup_logs_file(args.o, "gen_clusters") - gen_cluster(args.o, data_loading_comb_sep, args.dt) + gen_type_cluster(args.o, data_loading_comb_sep, args.dt) + +def predict_split(args): + from type4py.predict_split import test_split + setup_logs_file(args.o, "predict_sep") + if args.c: + test_split(args.o, data_loading_comb_sep) def eval(args): @@ -178,7 +184,7 @@ def main(): learning_parser.set_defaults(func=learn) # Learning phase split - learning_parser_sep = sub_parsers.add_parser('learn_sep') + learning_parser_sep = sub_parsers.add_parser('learns') learning_parser_sep.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") learning_parser_sep.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") @@ -204,11 +210,17 @@ def main(): help="Type4py model w/o visible type hints") predict_parser.set_defaults(func=predict) - # gen type cluster incremental: predict phase - predict_parser = sub_parsers.add_parser('gen_clu') - predict_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - predict_parser.add_argument('--dt', '--datatype', required=True, 
help="Datatype for generating type clusters") - predict_parser.set_defaults(func=gen_cluster) + # gen type cluster incremental: predict phase generate type cluster + predict_parser_gen_cluster = sub_parsers.add_parser('gen_type_clu') + predict_parser_gen_cluster.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") + predict_parser_gen_cluster.add_argument('--dt', '--datatype', required=True, help="Datatype for generating type clusters") + predict_parser_gen_cluster.set_defaults(func=gen_type_cluster) + + # gen predictions via type cluster: predict phase generate predictions + predict_parser_gen_pred = sub_parsers.add_parser('predicts') + predict_parser_gen_pred.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") + predict_parser_gen_pred.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") + predict_parser_gen_pred.set_defaults(func=predict_split) # Evaluation phase eval_parser = sub_parsers.add_parser('eval') diff --git a/type4py/exceptions.py b/type4py/exceptions.py new file mode 100644 index 0000000..219d62b --- /dev/null +++ b/type4py/exceptions.py @@ -0,0 +1,37 @@ +class ModelNotFit(Exception): + pass + + +class NotCompleteModel(ModelNotFit): + def __init__(self): + super().__init__("learn_split may just fit for complete model!") + + +class TrainedModel(Exception): + pass + + +class ModelTrainedError(TrainedModel): + def __init__(self): + super().__init__("Model has been trained for this dataset!") + + +class EmdTypeError(Exception): + pass + + +class EmdTypeNotFound(EmdTypeError): + def __init__(self): + super().__init__("Embedding Type not found!") + + +class TypeClusterNotFound(Exception): + def __init__(self): + super().__init__("Type clusters not found!") + +class ModelNotfound(Exception): + pass + +class ModelNotExistsError(ModelNotfound): + def __init__(self, model_name): + super().__init__(f"Model {model_name} not found!") diff --git a/type4py/gen_cluster.py b/type4py/gen_type_cluster.py similarity index 57% rename from type4py/gen_cluster.py rename to type4py/gen_type_cluster.py index 9acfa7d..5a0e1a7 100644 --- a/type4py/gen_cluster.py +++ b/type4py/gen_type_cluster.py @@ -1,12 +1,14 @@ import argparse import os -from type4py.learn import load_model, TripletModel +from type4py.learn import load_model, TripletModel, Type4Py +from type4py.predict import predict_type_embed, predict_type_embed_task from type4py.data_loaders import select_data, TripletDataset, load_test_data_per_model, load_training_data_per_model_sep from type4py.deploy.infer import compute_types_score from type4py.utils import load_model_params, setup_logs_file from type4py import logger, MIN_DATA_POINTS, KNN_TREE_SIZE, data_loaders +from type4py.exceptions import ModelNotExistsError from libsa4py.utils import save_json from typing import Tuple, List from os.path import join @@ -25,130 +27,11 @@ logger.name = __name__ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -class ModelNotfound(Exception): - pass - -class ModelNotExistsError(ModelNotfound): - def __init__(self, model_name): - super().__init__(f"Model {model_name} not found!") - -class Type4Py(nn.Module): - """ - Complete model - """ - - def __init__(self, input_size: int, hidden_size: int, aval_type_size: int, - num_layers: int, output_size: int, dropout_rate: float): - super(Type4Py, self).__init__() - - self.input_size = input_size - self.hidden_size = hidden_size - self.aval_type_size = aval_type_size - 
self.num_layers = num_layers - self.output_size = output_size - - self.lstm_id = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, - bidirectional=True) - self.lstm_tok = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, - bidirectional=True) - self.linear = nn.Linear(self.hidden_size * 2 * 2 + self.aval_type_size, self.output_size) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward(self, x_id, x_tok, x_type): - # Using dropout on input sequences - x_id = self.dropout(x_id) - x_tok = self.dropout(x_tok) - - # Flattens LSTMs weights for data-parallelism in multi-GPUs config - self.lstm_id.flatten_parameters() - self.lstm_tok.flatten_parameters() - - x_id, _ = self.lstm_id(x_id) - x_tok, _ = self.lstm_tok(x_tok) - - # Decode the hidden state of the last time step - x_id = x_id[:, -1, :] - x_tok = x_tok[:, -1, :] - - x = torch.cat((x_id, x_tok, x_type), 1) - - x = self.linear(x) - return x - - -class TripletModel(nn.Module): - """ - A model with Triplet loss for similarity learning - """ - - def __init__(self, model: nn.Module): - super(TripletModel, self).__init__() - self.model = model - - def forward(self, a, p, n): - """ - A triplet consists of anchor, positive examples and negative examples - """ - # return self.model(*(s.to(DEVICE) for s in a)), \ - # self.model(*(s.to(DEVICE) for s in p)), \ - # self.model(*(s.to(DEVICE) for s in n)) - - return self.model(*(s for s in a)), \ - self.model(*(s for s in p)), \ - self.model(*(s for s in n)) - - -def predict_type_embed(types_embed_array: np.array, types_embed_labels: np.array, - indexed_knn: AnnoyIndex, k: int) -> List[dict]: - """ - Predict type of given type embedding vectors - """ - - pred_types_embed = [] - pred_types_score = [] - for i, embed_vec in enumerate( - tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")): - idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True) - pred_idx_scores = compute_types_score(dist, idx, types_embed_labels) - pred_types_embed.append([i for i, s in pred_idx_scores]) - pred_types_score.append(pred_idx_scores) - - return pred_types_embed, pred_types_score - - -def predict_type_embed_task(types_embed_array: np.array, types_embed_labels: np.array, type_space_labels: np.array, - pred_task_idx: tuple, indexed_knn: AnnoyIndex, k: int) -> List[dict]: - def find_pred_task(i: int): - if i < pred_task_idx[0]: - return 'Parameter' - elif i < pred_task_idx[1]: - return 'Return' - else: - return 'Variable' - - pred_types: List[dict] = [] - # pred_types_embed = [] - # pred_types_score = [] - for i, embed_vec in enumerate( - tqdm(types_embed_array, total=len(types_embed_array), desc="Finding KNNs & Prediction")): - idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k, include_distances=True) - pred_idx_scores = compute_types_score(dist, idx, type_space_labels) - - pred_types.append({'original_type': types_embed_labels[i], 'predictions': pred_idx_scores, - 'task': find_pred_task(i), - 'is_parametric': bool(re.match(r'(.+)\[(.+)\]', types_embed_labels[i]))}) - - # pred_types_embed.append([i for i, s in pred_idx_scores]) - # pred_types_score.append(pred_idx_scores) - - return pred_types - - def build_type_clusters(model, output_path, train_data_loader: DataLoader, valid_data_loader: DataLoader, type_vocab: set, exist_index: str, exist_emd: str): logger.info("Type Cluster building begin...") computed_embed_labels = [] - annoy_idx = AnnoyIndex(model.output_size, 'euclidean') + current_annoy_idx = 
AnnoyIndex(model.output_size, 'euclidean') loaded_idx = AnnoyIndex(model.output_size, 'euclidean') curr_idx = 0 @@ -157,7 +40,7 @@ def build_type_clusters(model, output_path, train_data_loader: DataLoader, valid curr_idx = loaded_idx.get_n_items() for i in range(loaded_idx.get_n_items()): item_vector = loaded_idx.get_item_vector(i) - annoy_idx.add_item(i, item_vector) + current_annoy_idx.add_item(i, item_vector) if exist_emd is not None: embedd_labels = np.load(join(output_path, exist_emd)).tolist() @@ -172,7 +55,7 @@ def build_type_clusters(model, output_path, train_data_loader: DataLoader, valid # computed_embed_labels.append(lables) for i, v in enumerate(output_a.data.cpu().numpy()): if lables[i] in type_vocab: - annoy_idx.add_item(curr_idx, v) + current_annoy_idx.add_item(curr_idx, v) computed_embed_labels.append(lables[i]) curr_idx += 1 @@ -185,33 +68,12 @@ def build_type_clusters(model, output_path, train_data_loader: DataLoader, valid # computed_embed_labels.append(a[1].data.cpu().numpy()) for i, v in enumerate(output_a.data.cpu().numpy()): if lables[i] in type_vocab: - annoy_idx.add_item(curr_idx, v) + current_annoy_idx.add_item(curr_idx, v) computed_embed_labels.append(lables[i]) curr_idx += 1 - annoy_idx.build(KNN_TREE_SIZE) - # annoy_idx. - return annoy_idx, np.array(computed_embed_labels) # np.hstack(computed_embed_labels) - - -def compute_type_embed_batch(model, data_loader: DataLoader, pca: PCA = None) -> Tuple[np.array, np.array]: - """ - Compute type embeddings for the whole dataset - """ - - computed_embed_batches = [] - computed_embed_labels = [] - - for batch_i, (a, p, n) in enumerate(tqdm(data_loader, total=len(data_loader), desc="Computing Type Clusters")): - model.eval() - with torch.no_grad(): - output_a = model(*(s.to(DEVICE) for s in a[0])) - output_a = output_a.data.cpu().numpy() - computed_embed_batches.append(pca.transform(output_a) if pca is not None else output_a) - computed_embed_labels.append(a[1].data.cpu().numpy()) - - return np.vstack(computed_embed_batches), np.hstack(computed_embed_labels) - + current_annoy_idx.build(KNN_TREE_SIZE) + return current_annoy_idx, np.array(computed_embed_labels) class DataTypeNotExistError(Exception): pass @@ -240,7 +102,7 @@ def find_existing_embedding(data_loading_funcs, output_path): return None, None -def gen_cluster(output_path: str, data_loading_funcs: dict, datatype: str, type_vocab_limit: int = None, +def gen_type_cluster(output_path: str, data_loading_funcs: dict, datatype: str, type_vocab_limit: int = None, use_tc_reduced: bool = False): logger.info(f"Testing Type4Py model") logger.info(f"**********************************************************************") diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 1d3bc83..9be9cc6 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -3,9 +3,11 @@ from type4py.data_loaders import select_data, TripletDataset, load_training_data_per_model, \ load_training_data_per_model_sep from type4py.vectorize import AVAILABLE_TYPES_NUMBER, W2V_VEC_LENGTH +from type4py.learn import load_model, TripletModel, Type4Py, create_knn_index, train_loop_dsl from type4py.eval import eval_type_embed from type4py.utils import load_model_params from type4py import logger, MIN_DATA_POINTS, KNN_TREE_SIZE +from type4py.exceptions import ModelTrainedError from torch.utils.data import DataLoader from typing import Tuple from collections import Counter @@ -18,206 +20,10 @@ import torch.nn as nn import torch import pickle -import pkg_resources logger.name = __name__ DEVICE = 
torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -class ModelNotFit(Exception): - pass - - -class NotCompleteModel(ModelNotFit): - def __init__(self): - super().__init__("learn_split may just fit for complete model!") - - -class TrainedModel(Exception): - pass - - -class ModelTrainedError(TrainedModel): - def __init__(self): - super().__init__("Model has been trained for this dataset!") - - -class Type4Py(nn.Module): - """ - Complete model - """ - - def __init__(self, input_size: int, hidden_size: int, aval_type_size: int, - num_layers: int, output_size: int, dropout_rate: float): - super(Type4Py, self).__init__() - - self.input_size = input_size - self.hidden_size = hidden_size - self.aval_type_size = aval_type_size - self.num_layers = num_layers - self.output_size = output_size - - self.lstm_id = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, - bidirectional=True) - self.lstm_tok = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first=True, - bidirectional=True) - self.linear = nn.Linear(self.hidden_size * 2 * 2 + self.aval_type_size, self.output_size) - self.dropout = nn.Dropout(p=dropout_rate) - - def forward(self, x_id, x_tok, x_type): - # Using dropout on input sequences - x_id = self.dropout(x_id) - x_tok = self.dropout(x_tok) - - # Flattens LSTMs weights for data-parallelism in multi-GPUs config - self.lstm_id.flatten_parameters() - self.lstm_tok.flatten_parameters() - - x_id, _ = self.lstm_id(x_id) - x_tok, _ = self.lstm_tok(x_tok) - - # Decode the hidden state of the last time step - x_id = x_id[:, -1, :] - x_tok = x_tok[:, -1, :] - - x = torch.cat((x_id, x_tok, x_type), 1) - - x = self.linear(x) - return x - - -class TripletModel(nn.Module): - """ - A model with Triplet loss for similarity learning - """ - - def __init__(self, model: nn.Module): - super(TripletModel, self).__init__() - self.model = model - - def forward(self, a, p, n): - """ - A triplet consists of anchor, positive examples and negative examples - """ - # return self.model(*(s.to(DEVICE) for s in a)), \ - # self.model(*(s.to(DEVICE) for s in p)), \ - # self.model(*(s.to(DEVICE) for s in n)) - - return self.model(*(s for s in a)), \ - self.model(*(s for s in p)), \ - self.model(*(s for s in n)) - - -def load_model(model_type: str, model_params: dict): - """ - Load the Type4Py model with desired confings - """ - - if model_type == "complete": - return Type4Py(W2V_VEC_LENGTH, model_params['hidden_size'], AVAILABLE_TYPES_NUMBER, model_params['layers'], - model_params['output_size'], model_params['dr']).to(DEVICE) - else: - raise NotCompleteModel - - -def create_knn_index(train_types_embed: np.array, valid_types_embed: np.array, type_embed_dim: int) -> AnnoyIndex: - """ - Creates KNNs index for given type embedding vectors - """ - - annoy_idx = AnnoyIndex(type_embed_dim, 'euclidean') - - for i, v in enumerate(tqdm(train_types_embed, total=len(train_types_embed), - desc="KNN index")): - annoy_idx.add_item(i, v) - - if valid_types_embed is not None: - for i, v in enumerate(valid_types_embed): - annoy_idx.add_item(len(train_types_embed) + i, v) - - annoy_idx.build(KNN_TREE_SIZE) - return annoy_idx - - -def train_loop_dsl(model: TripletModel, criterion, optimizer, train_data_loader: DataLoader, - valid_data_loader: DataLoader, learning_rate: float, epochs: int, - ubiquitous_types: str, common_types: set, model_path: str): - from type4py.predict import predict_type_embed - - for epoch in range(1, epochs + 1): - model.train() - # epoch_start_t = time() - 
total_loss = 0 - - for batch_i, (anchor, positive_ex, negative_ex) in enumerate(tqdm(train_data_loader, - total=len(train_data_loader), - desc=f"Epoch {epoch}")): - anchor, _ = anchor[0], anchor[1] - positive_ex, _ = positive_ex[0], positive_ex[1] - negative_ex, _ = negative_ex[0], negative_ex[1] - - optimizer.zero_grad() - anchor_embed, positive_ex_embed, negative_ex_embed = model(anchor, positive_ex, negative_ex) - loss = criterion(anchor_embed, positive_ex_embed, negative_ex_embed) - - # Backward and optimize - loss.backward() - optimizer.step() - - total_loss += loss.item() - - logger.info(f"epoch: {epoch} train loss: {total_loss}") - - if valid_data_loader is not None: - if epoch % 5 == 0: - logger.info("Evaluating on validation set") - valid_start = time() - valid_loss, valid_all_acc = compute_validation_loss_dsl(model, criterion, train_data_loader, - valid_data_loader, - predict_type_embed, ubiquitous_types, - common_types) - logger.info(f"epoch: {epoch} valid loss: {valid_loss} in {(time() - valid_start) / 60.0:.2f} min.") - # torch.save(model.module, join(model_path, f"{model.module.tw_embed_model.__class__.__name__}_{train_data_loader.dataset.dataset_name}_e{epoch}_{datetime.now().strftime('%b%d_%H-%M-%S')}.pt")) - - -def compute_validation_loss_dsl(model: TripletModel, criterion, train_valid_loader: DataLoader, - valid_data_loader: DataLoader, pred_func: callable, - ubiquitous_types: str, common_types: set) -> Tuple[float, float]: - """ - Computes validation loss for Deep Similarity Learning-based approach - """ - - valid_total_loss = 0 - with torch.no_grad(): - model.eval() - - if isinstance(model, nn.DataParallel): - main_model_forward = model.module.model - else: - main_model_forward = model.model - - computed_embed_batches_train = [] - computed_embed_labels_train = [] - computed_embed_batches_valid = [] - computed_embed_labels_valid = [] - - for batch_i, (anchor, positive_ex, negative_ex) in enumerate(tqdm(valid_data_loader, - total=len(valid_data_loader), - desc="Type Cluster - Valid set")): - positive_ex, _ = positive_ex[0], positive_ex[1] - negative_ex, _ = negative_ex[0], negative_ex[1] - - anchor_embed, positive_ex_embed, negative_ex_embed = model(anchor[0], positive_ex, negative_ex) - loss = criterion(anchor_embed, positive_ex_embed, negative_ex_embed) - valid_total_loss += loss.item() - - output_a = main_model_forward(*(s.to(DEVICE) for s in anchor[0])) - computed_embed_batches_valid.append(output_a.data.cpu().numpy()) - computed_embed_labels_valid.append(anchor[1].data.cpu().numpy()) - - return valid_total_loss, 0.0 - - def check_pickle_file(type, data_loading_funcs, output_path): var_exist = False param_exist = False diff --git a/type4py/predict_split.py b/type4py/predict_split.py new file mode 100644 index 0000000..fedb22f --- /dev/null +++ b/type4py/predict_split.py @@ -0,0 +1,70 @@ +import os + +from type4py.data_loaders import select_data, TripletDataset, load_training_data_per_model, load_test_data_per_model +from type4py.deploy.infer import compute_types_score +from type4py.learn import load_model, TripletModel, Type4Py +from type4py.exceptions import ModelNotExistsError +from type4py.utils import load_model_params +from type4py import logger, MIN_DATA_POINTS, KNN_TREE_SIZE +from type4py.predict import compute_type_embed_batch, predict_type_embed_task +from libsa4py.utils import save_json +from typing import Tuple, List +from os.path import join +from time import time +from torch.utils.data import DataLoader +from tqdm import tqdm +from annoy import AnnoyIndex 
+from sklearn.decomposition import PCA +import numpy as np +import pandas as pd +import pickle +import torch + +logger.name = __name__ +DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +def test_split(output_path: str, data_loading_funcs: dict): + + logger.info(f"Testing Type4Py model") + logger.info(f"**********************************************************************") + + # Model's hyper parameters + model_params = load_model_params() + if os.path.exists(join(output_path, f"type4py_{data_loading_funcs['name']}_model_var_param_ret.pt")): + model = torch.load(join(output_path, f"type4py_{data_loading_funcs['name']}_model_var_param_ret.pt")) + else: + raise ModelNotExistsError("type4py_{data_loading_funcs['name']}_model_var_param_ret.pt") + le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb')) + logger.info(f"Loaded the pre-trained Type4Py {data_loading_funcs['name']} model") + logger.info(f"Type4Py's trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}") + + annoy_index: AnnoyIndex = None + pca_transform: PCA = None + embed_labels: np.array = None + + + logger.info("Loading the reduced type clusters") + + pca_transform = pickle.load(open(join(output_path, "type_clusters_pca.pkl"), 'rb')) + embed_labels = np.load(join(output_path, f"type4py_{data_loading_funcs['name']}_true.npy")) + annoy_index = AnnoyIndex(pca_transform.n_components_, 'euclidean') + annoy_index.load(join(output_path, "type4py_complete_type_cluster_reduced")) + + logger.info("Loading test set") + test_data_loader, t_idx = load_test_data_per_model(data_loading_funcs, output_path, model_params['batches_test']) + logger.info("Mapping test samples to type clusters") + + test_type_embed, embed_test_labels = compute_type_embed_batch(model.model, test_data_loader, pca_transform) + + # Perform KNN search and predict + logger.info("Performing KNN search") + + train_valid_labels = le_all.inverse_transform(embed_labels) + embed_test_labels = le_all.inverse_transform(embed_test_labels) + pred_types = predict_type_embed_task(test_type_embed, embed_test_labels, + train_valid_labels, + t_idx, annoy_index, model_params['k']) + + save_json(join(output_path, f"type4py_{data_loading_funcs['name']}_test_predictions.json"), pred_types) + logger.info("Saved the Type4Py model's predictions on the disk") diff --git a/type4py/reduce.py b/type4py/reduce.py index 91acca9..d9d9514 100644 --- a/type4py/reduce.py +++ b/type4py/reduce.py @@ -6,6 +6,7 @@ from type4py import logger, KNN_TREE_SIZE from type4py.utils import load_model_params +from type4py.exceptions import TypeClusterNotFound from annoy import AnnoyIndex from sklearn.decomposition import PCA, IncrementalPCA from os.path import join @@ -15,12 +16,6 @@ logger.name = __name__ - -class TypeClusterNotFound(Exception): - def __init__(self): - super().__init__("Type clusters not found!") - - def reduce_tc(args): model_params = load_model_params() type_cluster_index = AnnoyIndex(model_params['output_size'], 'euclidean') diff --git a/type4py/vectorize.py b/type4py/vectorize.py index e4dd872..24fa08a 100644 --- a/type4py/vectorize.py +++ b/type4py/vectorize.py @@ -1,8 +1,9 @@ from gensim.models import Word2Vec from time import time from tqdm import tqdm -from type4py import logger, AVAILABLE_TYPES_NUMBER, TOKEN_SEQ_LEN +from type4py import logger, AVAILABLE_TYPES_NUMBER, TOKEN_SEQ_LEN, IDENTIFIER_SEQ_LEN from type4py.utils import mk_dir_not_exist +from type4py.exceptions import EmdTypeNotFound import os import 
multiprocessing import numpy as np @@ -13,12 +14,6 @@ W2V_VEC_LENGTH = 100 -class EmdTypeError(Exception): - pass - -class EmdTypeNotFound(EmdTypeError): - def __init__(self): - super().__init__("Embedding Type not found!") class TokenIterator: def __init__(self, param_df: pd.DataFrame, return_df: pd.DataFrame, @@ -239,7 +234,7 @@ def process_datapoints(df, output_path, embedding_type, type, trans_func, cached num_rows = datapoints.shape[0] if embedding_type == "identifiers_": - emd_shape = 31 + emd_shape = IDENTIFIER_SEQ_LEN elif embedding_type == "tokens_": emd_shape = TOKEN_SEQ_LEN[0]*TOKEN_SEQ_LEN[1] else: From 298c96282ffb8cef37aa922a6e768747c476916c Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 10 Apr 2023 14:02:08 +0200 Subject: [PATCH 07/43] fix the issues, add the predicts script amd exceptions script --- type4py/__main__.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index 37c08e5..7961ce9 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -52,19 +52,16 @@ def extract(args): p = Pipeline(args.c, args.o, True, False, args.d) p.run(find_repos_list(args.c) if args.l is None else find_repos_list(args.c)[:args.l], args.w) - def preprocess(args): from type4py.preprocess import preprocess_ext_fns setup_logs_file(args.o, "preprocess") preprocess_ext_fns(args.o, args.l, args.rvth) - def vectorize(args): from type4py.vectorize import vectorize_args_ret setup_logs_file(args.o, "vectorize") vectorize_args_ret(args.o) - def learn(args): from type4py.learn import train setup_logs_file(args.o, "learn") @@ -77,7 +74,6 @@ def learn(args): else: train(args.o, data_loading_comb, args.p, args.v) - # add learn_split function for CLI command "learn_split" def learn_split(args): from type4py.learn_split import train_split @@ -85,7 +81,6 @@ def learn_split(args): if args.c: train_split(args.o, data_loading_comb_sep, args.dt, args.p, args.v) - def predict(args): from type4py.predict import test setup_logs_file(args.o, "predict") @@ -98,7 +93,6 @@ def predict(args): elif args.c: test(args.o, data_loading_comb, args.l, args.rtc) - # add gen_cluster function for CLI command "gen_clu" def gen_type_cluster(args): from type4py.gen_type_cluster import gen_type_cluster @@ -111,7 +105,6 @@ def predict_split(args): if args.c: test_split(args.o, data_loading_comb_sep) - def eval(args): from type4py.eval import evaluate setup_logs_file(args.o, "eval") @@ -132,14 +125,12 @@ def infer(args): setup_logs_file(args.m, 'infer') infer_main(args.m, args.f) - # add projects-based infer function for command "infer_project" def infer_project(args): from type4py.deploy.infer_project import infer_project_main setup_logs_file(args.m, 'infer_project') infer_project_main(args.m, args.p, args.o, args.split) - def main(): arg_parser = argparse.ArgumentParser() sub_parsers = arg_parser.add_subparsers(dest='cmd') From 040807235a50cf9ccbac2d346cd3cbf471c0b74c Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 29 May 2023 09:24:31 +0200 Subject: [PATCH 08/43] update infer-project .py --- type4py/deploy/infer_project.py | 1 + 1 file changed, 1 insertion(+) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index e3a5156..98c1a32 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -87,3 +87,4 @@ def infer_project_main(model_path, input_path, output_path, split_file): t4py_pretrained_m.load_pretrained_model() infer_projects(t4py_pretrained_m, input_path, output_path, split_file) + From 
e33671369c7c6c7e48a200f5b2b4987968f1f6e3 Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 29 May 2023 09:56:16 +0200 Subject: [PATCH 09/43] add project-base inference pipeline --- type4py/__main__.py | 19 ++++++++++++++++--- type4py/deploy/infer_project.py | 25 ++++++++++++++----------- type4py/exceptions.py | 7 ++++++- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index 7961ce9..f56a299 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -2,6 +2,7 @@ from type4py.to_onnx import type4py_to_onnx from type4py.reduce import reduce_tc from type4py.utils import setup_logs_file +from type4py.exceptions import InferApproachNotFound from libsa4py.cst_pipeline import Pipeline from libsa4py.utils import find_repos_list import argparse @@ -126,10 +127,20 @@ def infer(args): infer_main(args.m, args.f) # add projects-based infer function for command "infer_project" +''' +project-based CLI command includes three approaches: +-t4py : typ4py model +-hybrid0: typ4py + pyre +-hybrid1: type4py + pyright +''' def infer_project(args): - from type4py.deploy.infer_project import infer_project_main - setup_logs_file(args.m, 'infer_project') - infer_project_main(args.m, args.p, args.o, args.split) + approach_list = {"t4py", "hybrid0", "hybrid1"} + if args.a in approach_list: + from type4py.deploy.infer_project import infer_project_main + setup_logs_file(args.m, 'infer_project') + infer_project_main(args.m, args.p, args.o, args.a, args.split) + else: + raise InferApproachNotFound def main(): arg_parser = argparse.ArgumentParser() @@ -245,6 +256,8 @@ def main(): help="Path to python projects folder for inference") infer_parser_pro.add_argument('--o', '--output', required=True, type=str, help="Path to store the ml_infer outputs") + infer_parser_pro.add_argument('--a', '--approach', required=True, type=str, + help="infer approach includes ml, hybrid0, hybrid1") # split according to dataset_split_repo.csv infer_parser_pro.add_argument('--split', '--split_file', required=True, type=str, help="file to store the split of projects") diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index 98c1a32..c840c02 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -8,7 +8,7 @@ from type4py import logger from libsa4py.exceptions import ParseError -from libsa4py.utils import list_files, find_repos_list +from libsa4py.utils import list_files, find_repos_list, save_json from pathlib import Path def find_test_list(project_dir, dataset_split): @@ -30,7 +30,7 @@ def find_test_list(project_dir, dataset_split): # logger.info(f"dataset_split file: {dataset_split} does not exist!") raise FileNotFoundError(f"dataset_split file: {dataset_split} does not exist!") -def infer(repo, model, project_dir, tar_dir): +def infer(repo, model, project_dir): project_author = repo["author"] project_name = repo["repo"] project_path = os.path.join(project_dir, project_author, project_name) @@ -64,12 +64,11 @@ def infer(repo, model, project_dir, tar_dir): project_analyzed_files[project_id]["src_files"].keys()]) / len( project_analyzed_files[project_id]["src_files"].keys()), 2) - processed_file = os.path.join(tar_dir, f"{project_author}{project_name}_mlInfer.json") - with open(processed_file, 'w') as json_f: - json.dump(project_analyzed_files, json_f, indent=4) + return project_analyzed_files -def infer_projects(model, project_dir, tar_dir, split_file): + +def infer_projects(model, project_dir, tar_dir, approach, split_file): if 
split_file is not None: repo_infos_test = find_test_list(project_dir, split_file) logger.info(f'Totally find {len(repo_infos_test)} projects in test set') @@ -78,13 +77,17 @@ def infer_projects(model, project_dir, tar_dir, split_file): repo_infos_test = find_repos_list(project_dir) logger.info(f'Totally find {len(repo_infos_test)} projects in project dir') - for repo in tqdm(repo_infos_test): - infer(repo, model, project_dir, tar_dir) + if approach == "t4py": + for repo in tqdm(repo_infos_test): + project_author = repo["author"] + project_name = repo["repo"] + filepath = os.path.join(tar_dir, f"{project_author}{project_name}_mlInfer.json") + processed_file = infer(repo, model, project_dir, tar_dir) + save_json(filepath, processed_file) -def infer_project_main(model_path, input_path, output_path, split_file): +def infer_project_main(model_path, input_path, output_path, approach, split_file): t4py_pretrained_m = PretrainedType4Py(model_path, "gpu", pre_read_type_cluster=False, use_pca=True) t4py_pretrained_m.load_pretrained_model() - - infer_projects(t4py_pretrained_m, input_path, output_path, split_file) + infer_projects(t4py_pretrained_m, input_path, output_path, approach, split_file) diff --git a/type4py/exceptions.py b/type4py/exceptions.py index 219d62b..eb75dc1 100644 --- a/type4py/exceptions.py +++ b/type4py/exceptions.py @@ -15,11 +15,16 @@ class ModelTrainedError(TrainedModel): def __init__(self): super().__init__("Model has been trained for this dataset!") +class ApproachError(Exception): + pass + +class InferApproachNotFound(ApproachError): + def __init__(self): + super().__init__("Infer Approach not in t4py, hybrid0, hybrid1!") class EmdTypeError(Exception): pass - class EmdTypeNotFound(EmdTypeError): def __init__(self): super().__init__("Embedding Type not found!") From 08a9f51120467c4bbaf7619ec1d835c03c14768a Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 29 May 2023 10:53:12 +0200 Subject: [PATCH 10/43] add project-base inference for ml & hybrid --- type4py/deploy/infer_project.py | 44 ++++++- type4py/deploy/static_infer.py | 71 ++++++++++++ type4py/deploy/utils/pyre_merge.py | 180 +++++++++++++++++++++++++++++ type4py/deploy/utils/pyre_utils.py | 124 ++++++++++++++++++++ type4py/deploy/utils/utils.py | 52 +++++++++ 5 files changed, 465 insertions(+), 6 deletions(-) create mode 100644 type4py/deploy/static_infer.py create mode 100644 type4py/deploy/utils/pyre_merge.py create mode 100644 type4py/deploy/utils/pyre_utils.py create mode 100644 type4py/deploy/utils/utils.py diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index c840c02..adea39d 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -2,7 +2,6 @@ from typing import List import pandas as pd import tqdm -import json from type4py.deploy.infer import PretrainedType4Py, type_annotate_file from type4py import logger @@ -10,6 +9,14 @@ from libsa4py.utils import list_files, find_repos_list, save_json from pathlib import Path +import multiprocessing +from type4py.deploy.static_infer import pyre_infer +from type4py.deploy.utils.pyre_merge import merge_pyre + + +ml_queue = multiprocessing.Queue() +pyre_queue = multiprocessing.Queue() +pyright_queue = multiprocessing.Queue() def find_test_list(project_dir, dataset_split): if os.path.exists(dataset_split): @@ -30,7 +37,7 @@ def find_test_list(project_dir, dataset_split): # logger.info(f"dataset_split file: {dataset_split} does not exist!") raise FileNotFoundError(f"dataset_split file: {dataset_split} does not 
exist!") -def infer(repo, model, project_dir): +def ml_infer(repo, model, project_dir): project_author = repo["author"] project_name = repo["repo"] project_path = os.path.join(project_dir, project_author, project_name) @@ -67,6 +74,13 @@ def infer(repo, model, project_dir): return project_analyzed_files +def run_mlInfer(): + ml_result = ml_infer(repo, model, project_dir) + ml_queue.put(ml_result) + +def run_pyreInfer(): + pyre_result = pyre_infer(repo, project_dir) + pyre_queue.put(pyre_result) def infer_projects(model, project_dir, tar_dir, approach, split_file): if split_file is not None: @@ -79,12 +93,30 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): if approach == "t4py": for repo in tqdm(repo_infos_test): - project_author = repo["author"] - project_name = repo["repo"] - filepath = os.path.join(tar_dir, f"{project_author}{project_name}_mlInfer.json") - processed_file = infer(repo, model, project_dir, tar_dir) + project_name = "".join((repo["author"], repo["repo"])) + filepath = os.path.join(tar_dir, f"{project_name}_mlInfer.json") + processed_file = ml_infer(repo, model, project_dir, tar_dir) save_json(filepath, processed_file) + if approach == "hybrid0": + for repo in tqdm(repo_infos_test): + process1 = multiprocessing.Process(target=run_mlInfer) + process2 = multiprocessing.Process(target=run_pyreInfer) + + # Start the processes + process1.start() + process2.start() + + # Get the results from t4py and pyre & merge + ml_result = ml_queue.get() + sa_result = pyre_queue.get() + + project_id = "/".join((repo["author"], repo["repo"])) + project_name = "".join((repo["author"], repo["repo"])) + hy_result = merge_pyre(ml_result, sa_result, project_id) + + filepath = os.path.join(tar_dir, f"{project_name}_hybridinfer0.json") + save_json(filepath, hy_result) def infer_project_main(model_path, input_path, output_path, approach, split_file): t4py_pretrained_m = PretrainedType4Py(model_path, "gpu", pre_read_type_cluster=False, use_pca=True) diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py new file mode 100644 index 0000000..4fb535d --- /dev/null +++ b/type4py/deploy/static_infer.py @@ -0,0 +1,71 @@ +import os +from pathlib import Path +import utils.pyre_utils as pyre_util +from utils.utils import rebuild_repo +from libsa4py.utils import list_files, read_file +from libsa4py.exceptions import ParseError +from libsa4py.cst_extractor import Extractor +import shutil + + +def pyre_start(project_path): + pyre_util.clean_watchman_config(project_path) + pyre_util.clean_pyre_config(project_path) + pyre_util.start_watchman(project_path) + pyre_util.start_pyre(project_path) + + +def pyre_infer(repo, project_dir): + # rebuild for masking original types + cache_path = "/cache_path" + os.mkdir(cache_path) + rebuild_repo(project_dir, cache_path, repo) + + project_author = repo["author"] + project_name = repo["repo"] + project_path = os.path.join(cache_path, project_author, project_name) + id_tuple = (project_author, project_name) + project_id = "/".join(id_tuple) + project_analyzed_files: dict = {project_id: {"src_files": {}, "type_annot_cove": 0.0}} + + print(f'Running pyre pipeline for project {project_path}') + pyre_start(project_path) + # start pyre infer for project + print(f'Running pyre infer for project {project_path}') + pyre_util.pyre_infer(project_path) + print(f'Extracting for {project_path}...') + project_files = list_files(project_path) + print(f"{project_path} has {len(project_files)} files") + + project_files = [(f, 
str(Path(f).relative_to(Path(project_path).parent))) for f in project_files] + + if len(project_files) != 0: + print(f'Running pyre query for project {project_path}') + try: + for filename, f_relative in project_files: + pyre_data_file = pyre_util.pyre_query_types(project_path, filename) + project_analyzed_files[project_id]["src_files"][filename] = \ + Extractor.extract(read_file(filename), pyre_data_file).to_dict() + except ParseError as err: + print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + except UnicodeDecodeError: + print(f"Could not read file {filename}") + except Exception as err: + print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + + print(f'Saving static analysis results for {project_id}...') + + if len(project_analyzed_files[project_id]["src_files"].keys()) != 0: + project_analyzed_files[project_id]["type_annot_cove"] = \ + round(sum([project_analyzed_files[project_id]["src_files"][s]["type_annot_cove"] for s in + project_analyzed_files[project_id]["src_files"].keys()]) / len( + project_analyzed_files[project_id]["src_files"].keys()), 2) + + pyre_util.watchman_shutdown(project_path) + pyre_util.pyre_server_shutdown(project_path) + pyre_util.clean_config(project_path) + + # remove cache projects + shutil.rmtree(cache_path) + + return project_analyzed_files diff --git a/type4py/deploy/utils/pyre_merge.py b/type4py/deploy/utils/pyre_merge.py new file mode 100644 index 0000000..26f9430 --- /dev/null +++ b/type4py/deploy/utils/pyre_merge.py @@ -0,0 +1,180 @@ +""" +functions for merging the type information from static analysis and machine learning +""" +import regex + + +def type_consist(t: str): + sub_regex = r'typing\.|typing_extensions\.|t\.|builtins\.|collections\.' + + def remove_quote_types(t: str): + s = regex.search(r'^\'(.+)\'$', t) + if bool(s): + return s.group(1) + else: + # print(t) + return t + + t = regex.sub(sub_regex, "", str(t)) + t = remove_quote_types(t) + return t + + +def check(t: str): + types = ["", "Any", "any", "None", "Object", "object", "type", "Type[Any]", + 'Type[cls]', 'Type[type]', 'Type', 'TypeVar', 'Optional[Any]'] + if t in types: + return False + else: + return True + + +def merge_vars(sa_dict, ml_dict): + add = 0 + var_dict_sa = sa_dict["variables"] + if "variables_p" in ml_dict.keys(): + var_dict_1 = ml_dict["variables_p"] + else: + var_dict_1 = {} + for var_key in var_dict_sa.keys(): + var_dict_sa[var_key] = type_consist(var_dict_sa[var_key]) + if check(var_dict_sa[var_key]): + if var_key in var_dict_1.keys(): + if len(var_dict_1[var_key]) != 0 and var_dict_1[var_key][0][0] != var_dict_sa[var_key]: + sa_type = [var_dict_sa[var_key], 1.1] + var_dict_1[var_key].insert(0, sa_type) + add = add + 1 + ml_dict["variables_p"] = var_dict_1 + return ml_dict, add + + +def merge_params(sa_dict, ml_dict): + add = 0 + param_dict_sa = sa_dict["params"] + if "params_p" in ml_dict.keys(): + param_dict_1 = ml_dict["params_p"] + else: + param_dict_1 = {} + + for param_key in param_dict_sa.keys(): + if param_key in param_dict_1.keys(): + param_dict_sa[param_key] = type_consist(param_dict_sa[param_key]) + if check(param_dict_sa[param_key]): + if len(param_dict_1[param_key]) != 0 and param_dict_1[param_key][0][0] != param_dict_sa[param_key]: + sa_type = [param_dict_sa[param_key], 1.1] + param_dict_1[param_key].insert(0, sa_type) + add = add + 1 + ml_dict["params_p"] = param_dict_1 + return ml_dict, add + + +def merge_ret_types(sa_dict, ml_dict): + add = 0 + ret_type_sa = sa_dict["ret_type"] + if "ret_type_p" in 
ml_dict.keys(): + ret_type_1 = ml_dict["ret_type_p"] + else: + ret_type_1 = [] + ret_type_sa = type_consist(ret_type_sa) + if check(ret_type_sa): + if len(ret_type_1) != 0 and ret_type_1[0][0] != ret_type_sa: + sa_type = [ret_type_sa, 1.1] + ret_type_1.insert(0, sa_type) + add = add + 1 + ml_dict["ret_type_p"] = ret_type_1 + return ml_dict, add + + +def merge_file(sa_file, ml_file): + add_var = 0 + add_params = 0 + add_ret_type = 0 + # print("find") + # merge variables in the py file + merged_file, add = merge_vars(sa_file, ml_file) + add_var = add_var + add + + # merge variables, params and ret_types in the functions in the py file + if "funcs" in ml_file.keys(): + func_list = sa_file['funcs'] + func_list_1 = merged_file['funcs'] + for i in range(len(func_list)): + if func_list[i]['name'] == func_list_1[i]['name']: + merged_func, add = merge_vars(func_list[i], func_list_1[i]) + func_list_1[i] = merged_func + add_var = add_var + add + merged_func, add = merge_params(func_list[i], func_list_1[i]) + func_list_1[i] = merged_func + add_params = add_params + add + merged_func, add = merge_ret_types(func_list[i], func_list_1[i]) + func_list_1[i] = merged_func + add_ret_type = add_ret_type + add + + merged_file['funcs'] = func_list_1 + + if "classes" in ml_file.keys(): + class_list = sa_file['classes'] + class_list_1 = merged_file['classes'] + for i in range(len(class_list)): + if class_list[i]['name'] == class_list_1[i]['name']: + # add vars in classes + class_list_1[i], add = merge_vars(class_list[i], class_list_1[i]) + add_var = add_var + add + # add vars, params, ret_types in functions in classes + if "funcs" in class_list_1[i].keys(): + func_list = class_list[i]['funcs'] + func_list_1 = class_list_1[i]['funcs'] + for j in range(len(func_list)): + if func_list[j]['name'] == func_list_1[j]['name']: + func_list_1[j], add = merge_vars(func_list[j], func_list_1[j]) + add_var = add_var + add + func_list_1[j], add = merge_params(func_list[j], func_list_1[j]) + add_params = add_params + add + func_list_1[j], add = merge_ret_types(func_list[j], func_list_1[j]) + add_ret_type = add_ret_type + add + class_list_1[i]['funcs'] = func_list_1 + merged_file['classes'] = class_list_1 + + return merged_file + +def merge_project(sa_dict, ml_dict): + merged_dict = {} + for key in ml_dict: + if key in sa_dict.keys(): + m_file = merge_file(sa_dict[key], ml_dict[key]) + merged_dict[key] = m_file + else: + merged_dict[key] = ml_dict[key] + return merged_dict + + +def update_key(file_name, project_id): + author = project_id.split("/")[0] + repo = project_id.split("/")[1] + list = file_name.split("/") + start_index = 0 + for i in range(len(list) - 1): + if list[i] == author and list[i + 1] == repo: + start_index = i + break + new_list = [] + new_list.append("data") + while start_index < len(list): + new_list.append(list[start_index]) + start_index = start_index + 1 + return "/".join(new_list) + + +def merge_pyre(ml_dict, static_dict, project_id): + src_dict = {} + src_dict_ml = {} + for key in static_dict[project_id]['src_files'].keys(): + key_new = update_key(key, project_id) + src_dict[key_new] = static_dict[project_id]['src_files'][key] + for key in ml_dict[project_id]['src_files'].keys(): + key_new = update_key(key, project_id) + src_dict_ml[key_new] = ml_dict[project_id]['src_files'][key] + merged_project: dict = {project_id: {"src_files": {}}} + merged_src_dict = merge_project(src_dict, src_dict_ml) + merged_project[project_id]["src_files"] = merged_src_dict + return merged_project \ No newline at end of file 
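The merge helpers above always rank a usable static type ahead of the model's candidates: the pyre type is normalised with type_consist(), filtered with check(), and inserted at the head of the prediction list with a score of 1.1, i.e. above any ML probability. A minimal sketch of that behaviour on a single file entry (hypothetical values; assumes the new module is importable as type4py.deploy.utils.pyre_merge):

from type4py.deploy.utils.pyre_merge import merge_vars

# pyre output for one file: variable `x` resolved to builtins.int
sa_file = {"variables": {"x": "builtins.int"}}
# Type4Py output for the same file: ranked [type, score] candidates per variable
ml_file = {"variables_p": {"x": [["str", 0.83], ["int", 0.11]]}}

merged, added = merge_vars(sa_file, ml_file)
print(merged["variables_p"]["x"])  # [['int', 1.1], ['str', 0.83], ['int', 0.11]]
print(added)                       # 1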
diff --git a/type4py/deploy/utils/pyre_utils.py b/type4py/deploy/utils/pyre_utils.py new file mode 100644 index 0000000..820d41f --- /dev/null +++ b/type4py/deploy/utils/pyre_utils.py @@ -0,0 +1,124 @@ +""" +Helper functions to use pyre in the pipeline +""" +from pathlib import Path +from subprocess import TimeoutExpired +from os.path import join, exists, isdir +from libcst.metadata.type_inference_provider import PyreData +import os +import shutil +import json +import subprocess +from typing import List, Optional, Tuple + + + +def run_command(cmd_args: List[str], timeout: Optional[int] = None + ) -> Tuple[str, str, int]: + process = subprocess.run(cmd_args, shell=True, capture_output=True, timeout=timeout) + return process.stdout.decode(), process.stderr.decode(), process.returncode + + +def clean_watchman_config(project_path: str): + # update the watchman config to the project + dict = {"root": "."} + if exists(join(project_path, '.watchmanconfig')): + os.remove(join(project_path, '.watchmanconfig')) + print(f"[WATCHMAN_CLEAN] config of {project_path} ") + + with open(join(project_path, '.watchmanconfig'), "w") as f: + json.dump(dict, f) + print(f"[WATCHMAN_WRITE] config of {project_path} ") + + +def clean_pyre_config(project_path: str): + # update pyre config file for the path + dict = { + "site_package_search_strategy": "pep561", + "source_directories": [ + "." + ], + "typeshed": "/pyre-check/stubs/typeshed/typeshed", + "workers":64 + } + + if exists(join(project_path, '.pyre_configuration')): + os.remove(join(project_path, '.pyre_configuration')) + print(f"[PYRE_CLEAN] config of {project_path} ") + + with open(join(project_path, '.pyre_configuration'), "w") as f: + json.dump(dict, f) + print(f"[PYRE_WRITE] config of {project_path} ") + + +def start_watchman(project_path: str): + # start watchman server + stdout, stderr, r_code = run_command( + "cd %s; watchman watch-project ." 
% project_path) + if r_code == 0: + print(f"[WATCHMAN SERVER] started at {project_path} ", stdout, stderr) + else: + print(f"[WATCHMAN_ERROR] p: {project_path}", stderr) + + +def start_pyre(project_path: str): + # start pyre server + stdout, stderr, r_code = run_command( + "cd %s; pyre start" % project_path) + print(f"[PYRE_SERVER] started at {project_path} ", stdout, stderr) + + +def pyre_infer(project_path: str): + # start pyre server for the project + stdout, stderr, r_code = run_command( + "cd %s; pyre infer; pyre infer -i --annotate-from-existing-stubs" % project_path) + print(f"[PYRE_INFER] started at {project_path} ", stdout, stderr) + + +def pyre_query_types(project_path: str, file_path: str, timeout: int = 600) -> Optional[PyreData]: + try: + file_types = None + stdout, stderr, r_code = run_command('''cd %s; pyre query "types(path='%s')"''' % (project_path, + str(Path( + file_path).relative_to( + Path(project_path)))), + timeout=timeout) + if r_code == 0: + file_types = json.loads(stdout)["response"][0] + else: + print(f"[PYRE_ERROR] p: {project_path}", stderr) + except KeyError: + print(f"[PYRE_ERROR] p: {project_path}", json.loads(stdout)['error']) + except TimeoutExpired as te: + print(f"[PYRE_TIMEOUT] p: {project_path}", te) + finally: + return file_types + + +def pyre_server_shutdown(project_path: str): + # stop pyre server in the project path + stdout, stderr, r_code = run_command("cd %s ; pyre stop" % project_path) + print(f"[PYRE_SERVER] stopped at {project_path} ", stdout, stderr) + +def watchman_shutdown(project_path: str): + # stop pyre server in the project path + stdout, stderr, r_code = run_command("cd %s ; watchman watch-del ." % project_path) + print(f"[WATCHMAN SERVER] stopped at {project_path} ", stdout, stderr) + + +def clean_config(project_path: str): + # clean watchman + if exists(join(project_path, '.watchmanconfig')): + os.remove(join(project_path, '.watchmanconfig')) + print(f"[WATCHMAN_CLEAN] config of {project_path} ") + + # clean pyre + if exists(join(project_path, '.pyre_configuration')): + os.remove(join(project_path, '.pyre_configuration')) + print(f"[PYRE_CLEAN] config of {project_path} ") + + # clean pyre folder + pyre_dir = join(project_path, '.pyre') + if exists(pyre_dir) and isdir(pyre_dir): + shutil.rmtree(pyre_dir) + diff --git a/type4py/deploy/utils/utils.py b/type4py/deploy/utils/utils.py new file mode 100644 index 0000000..e2fdf9e --- /dev/null +++ b/type4py/deploy/utils/utils.py @@ -0,0 +1,52 @@ +from libsa4py.cst_visitor import Visitor +from libsa4py.cst_transformers import TypeAdder, SpaceAdder, StringRemover, CommentAndDocStringRemover, NumberRemover, \ + TypeAnnotationRemover, TypeQualifierResolver +from libsa4py.exceptions import ParseError +from libsa4py.utils import read_file, write_file, find_repos_list, list_files +from pathlib import Path +import libcst as cst +import os +import shutil + +def rebuild(filename: str, project_dir: str, tar_dict: str, + program_types: cst.metadata.type_inference_provider.PyreData = None, + include_seq2seq: bool = True): + program = read_file(filename) + try: + parsed_program = cst.parse_module(program) + except Exception as e: + raise ParseError(str(e)) + + # Resolves qualified names for a modules' type annotations + program_tqr = cst.metadata.MetadataWrapper(parsed_program).visit(TypeQualifierResolver()) + + v = Visitor() + if program_types is not None: + mw = cst.metadata.MetadataWrapper(program_tqr, + cache={cst.metadata.TypeInferenceProvider: program_types}) + mw.visit(v) + else: + mw = 
cst.metadata.MetadataWrapper(program_tqr, cache={cst.metadata.TypeInferenceProvider: {'types': []}}) + mw.visit(v) + + if include_seq2seq: + v_type = TypeAnnotationRemover() + v_untyped = parsed_program.visit(v_type) + + relative_path = str(Path(filename).relative_to(Path(project_dir))) + tar_path = os.path.join(tar_dict, relative_path) + + write_file(tar_path, v_untyped.code) + + +def rebuild_repo(project_dir, tar_dir, repo_info): + repo_path = os.path.join(project_dir, repo_info['author'], repo_info['repo']) + tar_path = os.path.join(tar_dir, repo_info['author'], repo_info['repo']) + shutil.copytree(repo_path, tar_path) + for root, dirs, files in os.walk(tar_path): + for filename in files: + os.unlink(os.path.join(root, filename)) + + source_files = list_files(repo_path) + for filename in source_files: + rebuild(filename, project_dir, tar_dir) \ No newline at end of file From bd4bb81613d13ac0bc021aba730042fb7403555e Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 29 May 2023 11:07:06 +0200 Subject: [PATCH 11/43] add script explanations --- type4py/deploy/infer_project.py | 4 ++++ type4py/deploy/static_infer.py | 4 ++++ type4py/deploy/utils/pyre_utils.py | 2 +- type4py/deploy/utils/utils.py | 4 ++++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index adea39d..307eed7 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -1,3 +1,7 @@ +""" +This module is for infer projects and output json files based on three approaches: +type4py, pyre and pyright +""" import os from typing import List import pandas as pd diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index 4fb535d..de219df 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -1,3 +1,7 @@ +""" +This module is for static infer, including pyre infer and pyright infer +""" + import os from pathlib import Path import utils.pyre_utils as pyre_util diff --git a/type4py/deploy/utils/pyre_utils.py b/type4py/deploy/utils/pyre_utils.py index 820d41f..3a4b37b 100644 --- a/type4py/deploy/utils/pyre_utils.py +++ b/type4py/deploy/utils/pyre_utils.py @@ -101,7 +101,7 @@ def pyre_server_shutdown(project_path: str): print(f"[PYRE_SERVER] stopped at {project_path} ", stdout, stderr) def watchman_shutdown(project_path: str): - # stop pyre server in the project path + # stop watchman server in the project path stdout, stderr, r_code = run_command("cd %s ; watchman watch-del ." 
% project_path) print(f"[WATCHMAN SERVER] stopped at {project_path} ", stdout, stderr) diff --git a/type4py/deploy/utils/utils.py b/type4py/deploy/utils/utils.py index e2fdf9e..5a1fdc3 100644 --- a/type4py/deploy/utils/utils.py +++ b/type4py/deploy/utils/utils.py @@ -1,3 +1,7 @@ +''' +This module includes function for rebuild project for pyre infer, clean & mask type annotations +''' + from libsa4py.cst_visitor import Visitor from libsa4py.cst_transformers import TypeAdder, SpaceAdder, StringRemover, CommentAndDocStringRemover, NumberRemover, \ TypeAnnotationRemover, TypeQualifierResolver From 51a9693ca4a2fded66b3127742ee0bf32981931a Mon Sep 17 00:00:00 2001 From: fenglang Date: Mon, 5 Jun 2023 18:58:06 +0200 Subject: [PATCH 12/43] update infer-project base approach name t4pyre and t4pyright --- type4py/__main__.py | 6 +++--- type4py/deploy/infer_project.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index f56a299..60fcbbc 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -129,9 +129,9 @@ def infer(args): # add projects-based infer function for command "infer_project" ''' project-based CLI command includes three approaches: --t4py : typ4py model --hybrid0: typ4py + pyre --hybrid1: type4py + pyright +-t4py : type4py model +-t4pyre: typ4py + pyre +-t4pyright: type4py + pyright ''' def infer_project(args): approach_list = {"t4py", "hybrid0", "hybrid1"} diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index 307eed7..a940b78 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -102,7 +102,7 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): processed_file = ml_infer(repo, model, project_dir, tar_dir) save_json(filepath, processed_file) - if approach == "hybrid0": + if approach == "t4pyre": for repo in tqdm(repo_infos_test): process1 = multiprocessing.Process(target=run_mlInfer) process2 = multiprocessing.Process(target=run_pyreInfer) From 5c5f5221b5c4ace06d4d4cc2d3f87de489486857 Mon Sep 17 00:00:00 2001 From: fenglang Date: Tue, 6 Jun 2023 10:24:44 +0200 Subject: [PATCH 13/43] update t4pyright logic in infer-project approach --- type4py/deploy/infer_project.py | 29 ++++- type4py/deploy/static_infer.py | 56 ++++++++- type4py/deploy/utils/cst_utils.py | 117 +++++++++++++++++ type4py/deploy/utils/preprocess_utils.py | 63 ++++++++++ type4py/deploy/utils/pyre_merge.py | 36 +----- type4py/deploy/utils/pyright_merge.py | 153 +++++++++++++++++++++++ type4py/deploy/utils/pyright_utils.py | 78 ++++++++++++ 7 files changed, 498 insertions(+), 34 deletions(-) create mode 100644 type4py/deploy/utils/cst_utils.py create mode 100644 type4py/deploy/utils/preprocess_utils.py create mode 100644 type4py/deploy/utils/pyright_merge.py create mode 100644 type4py/deploy/utils/pyright_utils.py diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index a940b78..fcfc48f 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -14,8 +14,9 @@ from libsa4py.utils import list_files, find_repos_list, save_json from pathlib import Path import multiprocessing -from type4py.deploy.static_infer import pyre_infer +from type4py.deploy.static_infer import pyre_infer, pyright_infer from type4py.deploy.utils.pyre_merge import merge_pyre +from type4py.deploy.utils.pyright_merge import merge_pyright ml_queue = multiprocessing.Queue() @@ -86,6 +87,10 @@ def run_pyreInfer(): pyre_result = pyre_infer(repo, project_dir) 
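    # hand the pyre result back to the parent process through the shared multiprocessing queue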
pyre_queue.put(pyre_result) +def run_pyrightInfer(): + pyright_result = pyright_infer(repo, project_dir) + pyright_queue.put(pyright_result) + def infer_projects(model, project_dir, tar_dir, approach, split_file): if split_file is not None: repo_infos_test = find_test_list(project_dir, split_file) @@ -119,7 +124,27 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): project_name = "".join((repo["author"], repo["repo"])) hy_result = merge_pyre(ml_result, sa_result, project_id) - filepath = os.path.join(tar_dir, f"{project_name}_hybridinfer0.json") + filepath = os.path.join(tar_dir, f"{project_name}_t4pyreInfer.json") + save_json(filepath, hy_result) + + if approach == "t4pyright": + for repo in tqdm(repo_infos_test): + process1 = multiprocessing.Process(target=run_mlInfer) + process2 = multiprocessing.Process(target=run_pyrightInfer) + + # Start the processes + process1.start() + process2.start() + + # Get the results from t4py and pyright & merge + ml_result = ml_queue.get() + sa_result = pyright_queue.get() + + project_id = "/".join((repo["author"], repo["repo"])) + project_name = "".join((repo["author"], repo["repo"])) + hy_result = merge_pyright(ml_result, sa_result, project_id) + + filepath = os.path.join(tar_dir, f"{project_name}_t4pyrightInfer.json") save_json(filepath, hy_result) def infer_project_main(model_path, input_path, output_path, approach, split_file): diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index de219df..20898e0 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -6,10 +6,14 @@ from pathlib import Path import utils.pyre_utils as pyre_util from utils.utils import rebuild_repo -from libsa4py.utils import list_files, read_file +from utils.cst_utils import TypeAnnotationFinder, TypeAnnotationMasker +from utils.preprocess_utils import check, make_types_consistent +from libsa4py.utils import list_files, read_file, write_file from libsa4py.exceptions import ParseError from libsa4py.cst_extractor import Extractor import shutil +import libcst as cst +from tqdm import tqdm def pyre_start(project_path): @@ -73,3 +77,53 @@ def pyre_infer(repo, project_dir): shutil.rmtree(cache_path) return project_analyzed_files + + +def extract(code): + parsed_program = cst.parse_module(code) + transformer = TypeAnnotationFinder() + new_tree = cst.metadata.MetadataWrapper(parsed_program).visit(transformer) + return transformer.annotated_types + + +def mask_reveal(code, type): + parsed_program = cst.parse_module(code) + transformer_mask = TypeAnnotationMasker(type) + new_tree = cst.metadata.MetadataWrapper(parsed_program).visit(transformer_mask) + return new_tree.code + + + +def pyright_infer(project_path): + project_author = project_path.split("/")[len(project_path.split("/")) - 2] + project_name = project_path.split("/")[len(project_path.split("/")) - 1] + t_list = [] + files = list_files(project_path) + for file in tqdm(files): + try: + pre_list = [] + code = read_file(file) + type_list = extract(code) + for type_info in type_list: + label = make_types_consistent(type_info["label"]) + if not check(label): + continue + code_org = code + code_masked = mask_reveal(code_org, type_info) + write_file(f"{project_author}{project_name}.py", code_masked) + predict = None + # for pyright infer + predict = pyright_infer(f"{project_author}{project_name}.py", type_info["dt"], type_info["name"]) + if predict is not None: + predict["label"] = label + predict["loc"] = type_info["loc"] + pre_list.append(predict) + 
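            # remove the temporary masked file before moving on to the next annotation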
os.remove(f"{project_author}{project_name}.py") + if len(pre_list) != 0: + t_list.append({file: pre_list}) + except ParseError as err: + print(err) + except UnicodeDecodeError: + print(f"Could not read file {file}") + + return t_list diff --git a/type4py/deploy/utils/cst_utils.py b/type4py/deploy/utils/cst_utils.py new file mode 100644 index 0000000..eeca84b --- /dev/null +++ b/type4py/deploy/utils/cst_utils.py @@ -0,0 +1,117 @@ +import libcst as cst +from libcst.metadata import PositionProvider, CodeRange + + +class TypeAnnotationFinder(cst.CSTTransformer): + """ + It find all type annotations from a source code + use a dict object for saving the type information: + for example: + { + dt: "param", + func_name: "my_foo", + name: "foo", + label: "Fool" + } + """ + METADATA_DEPENDENCIES = (PositionProvider,) + + def __init__(self): + super().__init__() + self.annotated_types = [] + self.var_list = set() + + def __get_line_column_no(self, node): + lc = self.get_metadata(cst.metadata.PositionProvider, node) + return (lc.start.line, lc.start.column), (lc.end.line, lc.end.column) + + def visit_FunctionDef(self, original_node: cst.FunctionDef): + fn_pos = self.__get_line_column_no(original_node) + if original_node.returns is not None: + node_module = cst.Module([original_node.returns.annotation]) + type_dict = dict(dt="ret", func_name=original_node.name.value, name="ret_type", + label=node_module.code, loc = fn_pos) + self.annotated_types.append(type_dict) + for param in original_node.params.params: + if param.annotation is not None: + node_module = cst.Module([param.annotation.annotation]) + type_dict = dict(dt="param", func_name=original_node.name.value, name=param.name.value, + label=node_module.code, loc=fn_pos) + self.annotated_types.append(type_dict) + + def visit_AnnAssign(self, node: cst.AnnAssign) -> None: + pos = self.__get_line_column_no(node.target) + node_module = cst.Module([node.annotation.annotation]) + var_name = cst.Module([node.target]).code + if pos not in self.var_list: + type_dict = dict(dt="var", func_name="__global__", name=var_name, + label=node_module.code, loc=pos) + self.annotated_types.append(type_dict) + self.var_list.add(pos) + + +class TypeAnnotationMasker(cst.CSTTransformer): + """ + It removes type annotations and add reveal_type() for static analysis + """ + METADATA_DEPENDENCIES = (PositionProvider,) + + def __init__(self, tar_dict): + super().__init__() + self.target = tar_dict + self.dt = tar_dict["dt"] + self.find = False + + def __get_line_column_no(self, node): + lc = self.get_metadata(cst.metadata.PositionProvider, node) + return (lc.start.line, lc.start.column), (lc.end.line, lc.end.column) + + def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node): + fn_pos = self.__get_line_column_no(original_node) + if self.find == False and fn_pos == self.target["loc"]: + + # for return types + if self.dt == "ret" and original_node.returns is not None: + log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.name.value})")) + self.find = True + return cst.FlattenSentinel([updated_node.with_changes(returns=None), log_stmt]) + + # for parameterss + elif self.dt == "param": + updated_params = [] + for param in original_node.params.params: + if param.name.value == self.target["name"]: + self.find = True + param_untyped = param.with_changes(annotation=None, comma=None) + updated_params.append(param_untyped) + else: + updated_params.append(param) + log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.name.value})")) + return 
cst.FlattenSentinel([updated_node.with_changes( + params=cst.Parameters(updated_params)), log_stmt] + ) + + else: + return updated_node + + else: + return updated_node + + def leave_AnnAssign(self, original_node: cst.AnnAssign, updated_node) -> None: + if self.find == False and self.dt == "var" and self.target["func_name"] == "__global__": + pos = self.__get_line_column_no(original_node.target) + var_name = cst.Module([original_node.target]).code + if pos == self.target["loc"] and var_name == self.target["name"]: + self.find = True + log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.target.value})")) + if original_node.value == None: + updated_node = cst.Assign(targets=[cst.AssignTarget(target=original_node.target)], + value=cst.Ellipsis()) + else: + updated_node = cst.Assign(targets=[cst.AssignTarget(target=original_node.target)], + value=original_node.value) + return cst.FlattenSentinel([updated_node, log_stmt]) + else: + return updated_node + else: + return updated_node diff --git a/type4py/deploy/utils/preprocess_utils.py b/type4py/deploy/utils/preprocess_utils.py new file mode 100644 index 0000000..14e2a83 --- /dev/null +++ b/type4py/deploy/utils/preprocess_utils.py @@ -0,0 +1,63 @@ +''' +Including functions for preprocess among different infer results +- type consis +- remove alias +- check type +''' + +import regex + +def check(t: str): + types = ["", "Any", "any", "None", "Object", "object", "type", "Type[Any]", + 'Type[cls]', 'Type[type]', 'Type', 'TypeVar', 'Optional[Any]'] + if t in types: + return False + else: + return True + +def make_types_consistent(t: str): + """ + Removes typing module from type annotations + """ + sub_regex = r'typing\.|typing_extensions\.|t\.|builtins\.|collections\.' + + def remove_quote_types(t: str): + s = regex.search(r'^\'(.+)\'$', t) + if bool(s): + return s.group(1) + else: + # print(t) + return t + + t = regex.sub(sub_regex, "", str(t)) + t = remove_quote_types(t) + return t + + +def resolve_type_aliasing(t: str): + """ + Resolves type aliasing and mappings. e.g. `[]` -> `list` + """ + type_aliases = {'(?<=.*)any(?<=.*)|(?<=.*)unknown(?<=.*)': 'Any', + '^{}$|^Dict$|^Dict\[\]$|(?<=.*)Dict\[Any, *?Any\](?=.*)|^Dict\[unknown, *Any\]$': 'dict', + '^Set$|(?<=.*)Set\[\](?<=.*)|^Set\[Any\]$': 'set', + '^Tuple$|(?<=.*)Tuple\[\](?<=.*)|^Tuple\[Any\]$|(?<=.*)Tuple\[Any, *?\.\.\.\](?=.*)|^Tuple\[unknown, *?unknown\]$|^Tuple\[unknown, *?Any\]$|(?<=.*)tuple\[\](?<=.*)': 'tuple', + '^Tuple\[(.+), *?\.\.\.\]$': r'Tuple[\1]', + '\\bText\\b': 'str', + '^\[\]$|(?<=.*)List\[\](?<=.*)|^List\[Any\]$|^List$': 'list', + '^\[{}\]$': 'List[dict]', + '(?<=.*)Literal\[\'.*?\'\](?=.*)': 'Literal', + '(?<=.*)Literal\[\d+\](?=.*)': 'Literal', # Maybe int?! 
+ '^Callable\[\.\.\., *?Any\]$|^Callable\[\[Any\], *?Any\]$|^Callable[[Named(x, Any)], Any]$': 'Callable', + '^Iterator[Any]$': 'Iterator', + '^OrderedDict[Any, *?Any]$': 'OrderedDict', + '^Counter[Any]$': 'Counter', + '(?<=.*)Match[Any](?<=.*)': 'Match'} + + def resolve_type_alias(t: str): + for t_alias in type_aliases: + if regex.search(regex.compile(t_alias), t): + t = regex.sub(regex.compile(t_alias), type_aliases[t_alias], t) + return t + + return resolve_type_alias(t) \ No newline at end of file diff --git a/type4py/deploy/utils/pyre_merge.py b/type4py/deploy/utils/pyre_merge.py index 26f9430..6125986 100644 --- a/type4py/deploy/utils/pyre_merge.py +++ b/type4py/deploy/utils/pyre_merge.py @@ -1,33 +1,7 @@ """ functions for merging the type information from static analysis and machine learning """ -import regex - - -def type_consist(t: str): - sub_regex = r'typing\.|typing_extensions\.|t\.|builtins\.|collections\.' - - def remove_quote_types(t: str): - s = regex.search(r'^\'(.+)\'$', t) - if bool(s): - return s.group(1) - else: - # print(t) - return t - - t = regex.sub(sub_regex, "", str(t)) - t = remove_quote_types(t) - return t - - -def check(t: str): - types = ["", "Any", "any", "None", "Object", "object", "type", "Type[Any]", - 'Type[cls]', 'Type[type]', 'Type', 'TypeVar', 'Optional[Any]'] - if t in types: - return False - else: - return True - +from preprocess_utils import check, make_types_consistent def merge_vars(sa_dict, ml_dict): add = 0 @@ -37,7 +11,7 @@ def merge_vars(sa_dict, ml_dict): else: var_dict_1 = {} for var_key in var_dict_sa.keys(): - var_dict_sa[var_key] = type_consist(var_dict_sa[var_key]) + var_dict_sa[var_key] = make_types_consistent(var_dict_sa[var_key]) if check(var_dict_sa[var_key]): if var_key in var_dict_1.keys(): if len(var_dict_1[var_key]) != 0 and var_dict_1[var_key][0][0] != var_dict_sa[var_key]: @@ -58,7 +32,7 @@ def merge_params(sa_dict, ml_dict): for param_key in param_dict_sa.keys(): if param_key in param_dict_1.keys(): - param_dict_sa[param_key] = type_consist(param_dict_sa[param_key]) + param_dict_sa[param_key] = make_types_consistent(param_dict_sa[param_key]) if check(param_dict_sa[param_key]): if len(param_dict_1[param_key]) != 0 and param_dict_1[param_key][0][0] != param_dict_sa[param_key]: sa_type = [param_dict_sa[param_key], 1.1] @@ -75,7 +49,7 @@ def merge_ret_types(sa_dict, ml_dict): ret_type_1 = ml_dict["ret_type_p"] else: ret_type_1 = [] - ret_type_sa = type_consist(ret_type_sa) + ret_type_sa = make_types_consistent(ret_type_sa) if check(ret_type_sa): if len(ret_type_1) != 0 and ret_type_1[0][0] != ret_type_sa: sa_type = [ret_type_sa, 1.1] @@ -89,7 +63,7 @@ def merge_file(sa_file, ml_file): add_var = 0 add_params = 0 add_ret_type = 0 - # print("find") + # merge variables in the py file merged_file, add = merge_vars(sa_file, ml_file) add_var = add_var + add diff --git a/type4py/deploy/utils/pyright_merge.py b/type4py/deploy/utils/pyright_merge.py new file mode 100644 index 0000000..3d9bf0d --- /dev/null +++ b/type4py/deploy/utils/pyright_merge.py @@ -0,0 +1,153 @@ +from preprocess_utils import check, make_types_consistent, resolve_type_aliasing + +def merge_vars(var_dict_sa, ml_dict, range): + if "variables_p" in ml_dict.keys(): + var_dict_1 = ml_dict["variables_p"] + else: + var_dict_1 = {} + for var_slot in var_dict_sa: + var_name = var_slot["name"] + var_pred = make_types_consistent(var_slot["type"]) + var_pred = resolve_type_aliasing(var_pred) + # var_dict_sa[var_key] = type_consist(var_dict_sa[var_key]) + if check(var_pred): + if 
range == "model": + if var_name in var_dict_1.keys() and var_slot["loc"] == ml_dict["mod_var_ln"][var_name]: + if len(var_dict_1[var_name]) != 0 and var_dict_1[var_name][0][0] != var_pred: + sa_type = [var_pred, 1.2] + var_dict_1[var_name].insert(0, sa_type) + if range == "func": + if var_name in var_dict_1.keys() and var_slot["loc"] == ml_dict["fn_var_ln"][var_name]: + if len(var_dict_1[var_name]) != 0 and var_dict_1[var_name][0][0] != var_pred: + sa_type = [var_pred, 1.2] + var_dict_1[var_name].insert(0, sa_type) + if range == "class": + if var_name in var_dict_1.keys() and var_slot["loc"] == ml_dict["cls_var_ln"][var_name]: + if len(var_dict_1[var_name]) != 0 and var_dict_1[var_name][0][0] != var_pred: + sa_type = [var_pred, 1.2] + var_dict_1[var_name].insert(0, sa_type) + ml_dict["variables_p"] = var_dict_1 + return ml_dict + + +def merge_params(param_dict_sa, ml_dict): + if "params_p" in ml_dict.keys(): + param_dict_1 = ml_dict["params_p"] + else: + param_dict_1 = {} + + for param_slot in param_dict_sa: + param_name = param_slot["name"] + if param_slot["loc"] == ml_dict["fn_lc"] and param_name in param_dict_1.keys(): + type_pred = make_types_consistent(param_slot["type"]) + type_pred = resolve_type_aliasing(type_pred) + if len(param_dict_1[param_name]) != 0 and param_dict_1[param_name][0][0] != type_pred: + sa_type = [type_pred, 1.2] + param_dict_1[param_name].insert(0, sa_type) + ml_dict["params_p"] = param_dict_1 + return ml_dict + + +def merge_returns(ret_dict_sa, ml_dict): + if "ret_type_p" in ml_dict.keys(): + ret_type_1 = ml_dict["ret_type_p"] + else: + ret_type_1 = [] + for ret_slot in ret_dict_sa: + if ret_slot["loc"] == ml_dict["fn_lc"]: + type_pred = make_types_consistent(ret_slot["type"]) + type_pred = resolve_type_aliasing(type_pred) + if len(ret_type_1) != 0 and ret_type_1[0][0] != type_pred: + sa_type = [type_pred, 1.2] + ret_type_1.insert(0, sa_type) + ml_dict["ret_type_p"] = ret_type_1 + return ml_dict + +def merge_file(sa_file, ml_file): + filtered_list = [d for d in sa_file if d.get('type') != 'Unknown'] + var_preds = [d for d in filtered_list if d.get("task") == 'var'] + param_preds = [d for d in filtered_list if d.get("task") == "parameter"] + ret_preds = [d for d in filtered_list if d.get("task") == "return"] + + merged_file = merge_vars(var_preds, ml_file, "model") + + if "funcs" in ml_file.keys(): + func_list = merged_file['funcs'] + func_list_1 = merged_file['funcs'] + for i in range(len(func_list)): + merged_func = merge_vars(var_preds, func_list[i], "func") + func_list_1[i] = merged_func + merged_func = merge_params(param_preds, func_list[i]) + func_list_1[i] = merged_func + merged_func = merge_returns(ret_preds, func_list[i]) + func_list_1[i] = merged_func + + merged_file['funcs'] = func_list_1 + + if "classes" in ml_file.keys(): + class_list = merged_file['classes'] + class_list_1 = merged_file['classes'] + for i in range(len(class_list)): + class_list_1[i] = merge_vars(var_preds, class_list[i], "class") + if "funcs" in class_list_1[i].keys(): + func_list = class_list[i]['funcs'] + func_list_1 = class_list_1[i]['funcs'] + for j in range(len(func_list)): + merged_func = merge_vars(var_preds, func_list[j], "func") + func_list_1[j] = merged_func + merged_func = merge_params(param_preds, func_list[j]) + func_list_1[j] = merged_func + merged_func = merge_returns(ret_preds, func_list[j]) + func_list_1[j] = merged_func + + class_list_1[i]['funcs'] = func_list_1 + merged_file['classes'] = class_list_1 + + return merged_file + + +def merge_project(sa_dict, ml_dict): 
+ merged_dict = {} + for key in ml_dict: + if key in sa_dict.keys(): + m_file = merge_file(sa_dict[key], ml_dict[key]) + merged_dict[key] = m_file + else: + merged_dict[key] = ml_dict[key] + return merged_dict + + +def update_key(file_name, project_id): + author = project_id.split("/")[0] + repo = project_id.split("/")[1] + list = file_name.split("/") + start_index = 0 + for i in range(len(list) - 1): + if list[i] == author and list[i + 1] == repo: + start_index = i + break + new_list = [] + new_list.append("data") + while start_index < len(list): + new_list.append(list[start_index]) + start_index = start_index + 1 + return "/".join(new_list) + + +def merge_pyright(ml_dict, static_dict, project_id): + src_dict = {} + src_dict_ml = {} + for p_dict in static_dict: + key = list(p_dict.keys())[0] + key_new = update_key(key, project_id) + src_dict[key_new] = p_dict[key] + for key in ml_dict[project_id]['src_files'].keys(): + key_new = update_key(key, project_id) + src_dict_ml[key_new] = ml_dict[project_id]['src_files'][key] + + merged_project: dict = {project_id: {"src_files": {}}} + print(len(src_dict)) + print(len(src_dict_ml)) + merged_src_dict = merge_project(src_dict, src_dict_ml) + merged_project[project_id]["src_files"] = merged_src_dict + return merged_project \ No newline at end of file diff --git a/type4py/deploy/utils/pyright_utils.py b/type4py/deploy/utils/pyright_utils.py new file mode 100644 index 0000000..d4a034b --- /dev/null +++ b/type4py/deploy/utils/pyright_utils.py @@ -0,0 +1,78 @@ +''' +This script is used to run pyright command and convert pyright info to type slot +''' + +import subprocess +import json +import re + +var_pattern = r'Type of "(\w+)" is "(\w+)"' +func_pattern = r'Type of "(.+)" is "(.+)"' +param_pattern = r"(\w+): (\w+(?:\[.*?\])?)" + +def run_command(cmd_args, timeout): + process = subprocess.run(cmd_args, shell=True, capture_output=True, timeout=timeout) + return process.stdout.decode(), process.stderr.decode(), process.returncode + + +''' +This function is used to parse the pyright expression into a type slot: +For example: +-parse: +"Type of "boo" is "str"" +-into: +"boo":"str" +''' +def parse_pyright(p_dict, dt, name): + type_str = p_dict["message"] + if dt == "var": + match = re.match(var_pattern, type_str) + if match and name == match.group(1): + return dict(name=match.group(1), type=match.group(2), task="var") + else: + return None + + else: + if dt == "ret": + match = re.match(func_pattern, type_str) + if match: + sig = match.group(2) + if "->" in sig: + predict_ret = sig.split(" -> ")[1] + else: + predict_ret = sig + return dict(name=match.group(1), type=predict_ret, task="return") + else: + return None + elif dt == "param": + match = re.match(func_pattern, type_str) + if match: + param_sig = match.group(2).split(" -> ")[0] + if len(param_sig) != 2: + un_quote = param_sig[1:-1] + matches = re.findall(param_pattern, un_quote) + for match_p in matches: + if name == match_p[0]: + return dict(name=match_p[0], type=match_p[1], task="parameter") + + else: + return None + else: + return None + + + +def pyright_infer(file_path, dt: str, name: str): + # run pyright command + try: + stdout, stderr, r_code = run_command("pyright %s --outputjson" % file_path, 60) + output = json.loads(stdout) + for dict in output["generalDiagnostics"]: + if dict["severity"] == "information": + t_dict = parse_pyright(dict, dt, name) + if t_dict is not None: + return t_dict + except Exception as e: + # somtimes cannot load the results of pyright correctly, so catch + 
print(str(e)) + pass \ No newline at end of file From 8698994e61ceec41ef083f11f52baba4994feae0 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 8 Jun 2023 10:46:05 +0200 Subject: [PATCH 14/43] rename type_preprocess script --- type4py/deploy/utils/{preprocess_utils.py => type_preprocess.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename type4py/deploy/utils/{preprocess_utils.py => type_preprocess.py} (100%) diff --git a/type4py/deploy/utils/preprocess_utils.py b/type4py/deploy/utils/type_preprocess.py similarity index 100% rename from type4py/deploy/utils/preprocess_utils.py rename to type4py/deploy/utils/type_preprocess.py From e9b1a11ef76ca3e29a9f3d540d0a6b6f85603a3f Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 8 Jun 2023 10:56:59 +0200 Subject: [PATCH 15/43] update TypeAnnotationFinder & Masker to libsa4py and import from it --- type4py/deploy/static_infer.py | 2 +- type4py/deploy/utils/cst_utils.py | 117 ------------------------------ 2 files changed, 1 insertion(+), 118 deletions(-) delete mode 100644 type4py/deploy/utils/cst_utils.py diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index 20898e0..37952d0 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -6,7 +6,7 @@ from pathlib import Path import utils.pyre_utils as pyre_util from utils.utils import rebuild_repo -from utils.cst_utils import TypeAnnotationFinder, TypeAnnotationMasker +from libsa4py.cst_transformers import TypeAnnotationFinder, TypeAnnotationMasker from utils.preprocess_utils import check, make_types_consistent from libsa4py.utils import list_files, read_file, write_file from libsa4py.exceptions import ParseError diff --git a/type4py/deploy/utils/cst_utils.py b/type4py/deploy/utils/cst_utils.py deleted file mode 100644 index eeca84b..0000000 --- a/type4py/deploy/utils/cst_utils.py +++ /dev/null @@ -1,117 +0,0 @@ -import libcst as cst -from libcst.metadata import PositionProvider, CodeRange - - -class TypeAnnotationFinder(cst.CSTTransformer): - """ - It find all type annotations from a source code - use a dict object for saving the type information: - for example: - { - dt: "param", - func_name: "my_foo", - name: "foo", - label: "Fool" - } - """ - METADATA_DEPENDENCIES = (PositionProvider,) - - def __init__(self): - super().__init__() - self.annotated_types = [] - self.var_list = set() - - def __get_line_column_no(self, node): - lc = self.get_metadata(cst.metadata.PositionProvider, node) - return (lc.start.line, lc.start.column), (lc.end.line, lc.end.column) - - def visit_FunctionDef(self, original_node: cst.FunctionDef): - fn_pos = self.__get_line_column_no(original_node) - if original_node.returns is not None: - node_module = cst.Module([original_node.returns.annotation]) - type_dict = dict(dt="ret", func_name=original_node.name.value, name="ret_type", - label=node_module.code, loc = fn_pos) - self.annotated_types.append(type_dict) - for param in original_node.params.params: - if param.annotation is not None: - node_module = cst.Module([param.annotation.annotation]) - type_dict = dict(dt="param", func_name=original_node.name.value, name=param.name.value, - label=node_module.code, loc=fn_pos) - self.annotated_types.append(type_dict) - - def visit_AnnAssign(self, node: cst.AnnAssign) -> None: - pos = self.__get_line_column_no(node.target) - node_module = cst.Module([node.annotation.annotation]) - var_name = cst.Module([node.target]).code - if pos not in self.var_list: - type_dict = dict(dt="var", func_name="__global__", name=var_name, - 
label=node_module.code, loc=pos) - self.annotated_types.append(type_dict) - self.var_list.add(pos) - - -class TypeAnnotationMasker(cst.CSTTransformer): - """ - It removes type annotations and add reveal_type() for static analysis - """ - METADATA_DEPENDENCIES = (PositionProvider,) - - def __init__(self, tar_dict): - super().__init__() - self.target = tar_dict - self.dt = tar_dict["dt"] - self.find = False - - def __get_line_column_no(self, node): - lc = self.get_metadata(cst.metadata.PositionProvider, node) - return (lc.start.line, lc.start.column), (lc.end.line, lc.end.column) - - def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node): - fn_pos = self.__get_line_column_no(original_node) - if self.find == False and fn_pos == self.target["loc"]: - - # for return types - if self.dt == "ret" and original_node.returns is not None: - log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.name.value})")) - self.find = True - return cst.FlattenSentinel([updated_node.with_changes(returns=None), log_stmt]) - - # for parameterss - elif self.dt == "param": - updated_params = [] - for param in original_node.params.params: - if param.name.value == self.target["name"]: - self.find = True - param_untyped = param.with_changes(annotation=None, comma=None) - updated_params.append(param_untyped) - else: - updated_params.append(param) - log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.name.value})")) - return cst.FlattenSentinel([updated_node.with_changes( - params=cst.Parameters(updated_params)), log_stmt] - ) - - else: - return updated_node - - else: - return updated_node - - def leave_AnnAssign(self, original_node: cst.AnnAssign, updated_node) -> None: - if self.find == False and self.dt == "var" and self.target["func_name"] == "__global__": - pos = self.__get_line_column_no(original_node.target) - var_name = cst.Module([original_node.target]).code - if pos == self.target["loc"] and var_name == self.target["name"]: - self.find = True - log_stmt = cst.Expr(cst.parse_expression(f"reveal_type({updated_node.target.value})")) - if original_node.value == None: - updated_node = cst.Assign(targets=[cst.AssignTarget(target=original_node.target)], - value=cst.Ellipsis()) - else: - updated_node = cst.Assign(targets=[cst.AssignTarget(target=original_node.target)], - value=original_node.value) - return cst.FlattenSentinel([updated_node, log_stmt]) - else: - return updated_node - else: - return updated_node From 0b491e267d02c473dfaa37dec24cb53efe2dbf4a Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 8 Jun 2023 10:58:42 +0200 Subject: [PATCH 16/43] update comments --- type4py/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index 60fcbbc..f9ab8a7 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -134,7 +134,7 @@ def infer(args): -t4pyright: type4py + pyright ''' def infer_project(args): - approach_list = {"t4py", "hybrid0", "hybrid1"} + approach_list = {"t4py", "t4pyre", "t4pyright"} if args.a in approach_list: from type4py.deploy.infer_project import infer_project_main setup_logs_file(args.m, 'infer_project') From afbddd714f744d9718d11eda24b06b4e975a0426 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 13:44:12 +0200 Subject: [PATCH 17/43] update vectorize --- type4py/vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/type4py/vectorize.py b/type4py/vectorize.py index 24fa08a..6c02333 100644 --- a/type4py/vectorize.py +++ 
b/type4py/vectorize.py @@ -245,7 +245,7 @@ def process_datapoints(df, output_path, embedding_type, type, trans_func, cached start_idx = i end_idx = min(i + batch_size, num_rows) batch = datapoints.iloc[start_idx:end_idx] - datapoints_X[start_idx:end_idx, :, :] = np.stack(batch.progress_apply(lambda x: x.generate_datapoint()), + datapoints_X[start_idx:end_idx, :, :] = np.stack(batch.apply(lambda x: x.generate_datapoint()), axis=0) np.save(os.path.join(output_path, embedding_type + type + '_datapoints_x'), datapoints_X) From 2e80ff180da361b01cb9d6651249d95d1c5f05ab Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 14:04:20 +0200 Subject: [PATCH 18/43] update preprocess --- type4py/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/type4py/preprocess.py b/type4py/preprocess.py index 26e5fbb..83482fe 100644 --- a/type4py/preprocess.py +++ b/type4py/preprocess.py @@ -243,7 +243,7 @@ def encode_all_types(df_ret: pd.DataFrame, df_params: pd.DataFrame, df_vars: pd. logger.info(f"Total no. of extracted types: {len(all_types):,}") logger.info(f"Total no. of unique types: {len(unq_types):,}") - return df_ret, df_params, le_all + return df_vars, df_ret, df_params, le_all def gen_most_frequent_avl_types(avl_types_dir, output_dir, top_n: int = 1024) -> pd.DataFrame: """ @@ -392,7 +392,7 @@ def preprocess_ext_fns(output_dir: str, limit: int = None, apply_random_vth: boo # Exclude variables without a type processed_proj_vars = filter_var_wo_type(processed_proj_vars) - processed_proj_fns, processed_proj_fns_params, le_all = encode_all_types(processed_proj_fns, processed_proj_fns_params, + processed_proj_vars, processed_proj_fns, processed_proj_fns_params, le_all = encode_all_types(processed_proj_fns, processed_proj_fns_params, processed_proj_vars, output_dir) # Exclude self from arg names and return expressions From f7d1fef815c46e92e477cfcac1dd7c3e0e7772d4 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 15:06:09 +0200 Subject: [PATCH 19/43] update learn_split.py --- type4py/learn_split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 9be9cc6..e62bb30 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -143,6 +143,8 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m if torch.cuda.device_count() > 1: model = nn.DataParallel(model) + logger.info(f"Model training on {DEVICE}") + criterion = torch.nn.TripletMarginLoss(margin=model_params['margin']) optimizer = torch.optim.Adam(model.parameters(), lr=model_params['lr']) From ed8af0d7cc91d79cfa3edbbd5c7cfeb74c9b0ba1 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 15:27:00 +0200 Subject: [PATCH 20/43] update learn_split.py --- type4py/learn_split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index e62bb30..352f9f2 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -156,6 +156,5 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m # Saving the model logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) - os.remove(output_path, trained_model_name) torch.save(model.module if torch.cuda.device_count() > 1 else model, join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From 2ff621e4753bc3a59192c403ce3375004e751a06 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 15:36:16 +0200 Subject: [PATCH 21/43] update 
learn_split.py --- type4py/learn_split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 352f9f2..26ad065 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -156,5 +156,7 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m # Saving the model logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) + if trained_model_name == None: + trained_model_name == f"type4py_{data_loading_funcs['name']}_model.pt" torch.save(model.module if torch.cuda.device_count() > 1 else model, join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From 1a5332aafc5b20e4ee38df693991eaa5d9a007eb Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 15:47:19 +0200 Subject: [PATCH 22/43] update learn_split.py --- type4py/learn_split.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 26ad065..aba20dd 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -128,7 +128,8 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m trained_model_name, trained_types = find_existing_model(data_loading_funcs, output_path) if trained_types == None: - logger.info("No trained model found, starting to intialize the model...") + trained_model_name = f"type4py_{data_loading_funcs['name']}_model.pt" + logger.info(f"No trained model found, starting to intialize the model {trained_model_name}...") # Loading the model model = load_model(data_loading_funcs['name'], model_params) logger.info(f"Intializing the {model.__class__.__name__} model") @@ -156,7 +157,5 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m # Saving the model logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) - if trained_model_name == None: - trained_model_name == f"type4py_{data_loading_funcs['name']}_model.pt" torch.save(model.module if torch.cuda.device_count() > 1 else model, join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From 40f1302a766f80399d46a9d8309d49f459f3e824 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 17 Aug 2023 15:51:54 +0200 Subject: [PATCH 23/43] update learn_split.py --- type4py/learn_split.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index aba20dd..562641b 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -12,7 +12,7 @@ from typing import Tuple from collections import Counter from multiprocessing import cpu_count -from os.path import join +from os.path import join, exists from time import time from annoy import AnnoyIndex from tqdm import tqdm @@ -157,5 +157,8 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m # Saving the model logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) + # remove old model + if exists(join(output_path, trained_model_name)): + os.remove(join(output_path, trained_model_name)) torch.save(model.module if torch.cuda.device_count() > 1 else model, join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From 9aa640b0ca84cad53ed2a381b2099f14a53321a7 Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 18 Aug 2023 12:42:09 +0200 Subject: [PATCH 24/43] update pipeline --- README.md | 18 ++++++++++++++++++ type4py/gen_type_cluster.py | 10 
+++++----- type4py/learn_split.py | 6 +++++- type4py/preprocess.py | 2 +- type4py/reduce.py | 6 +++--- type4py/to_onnx.py | 4 ++-- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cddabec..746cafd 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,24 @@ Description: - `--p $PARAM_FILE`: The path to user-provided hyper-parameters for the model. See [this](https://github.com/saltudelft/type4py/blob/main/type4py/model_params.json) file as an example. [Optional] +## 4*. Learning separately +``` +$ type4py learns --o $OUTPUT_DIR --dt $DATA_TYPE --c --p $PARAM_FILE +``` +- `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects. +- `$DATA_TYPE`: Sequential Learing, either `var`, or `param` or `ret` +- `--c`: Trains the complete model. Use `type4py learn -h` to see other configurations. + +- `--p $PARAM_FILE`: The path to user-provided hyper-parameters for the model. See [this](https://github.com/saltudelft/type4py/blob/main/type4py/model_params.json) file as an example. [Optional] + +## 4**. Gernerating Type Cluster +``` +$ type4py gen_type_clu --o $OUTPUT_DIR --dt $DATA_TYPE +``` +- `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects. +- `$DATA_TYPE`: Sequential Learing, either `var`, or `param` or `ret` + + ## 5. Testing ``` $ type4py predict --o $OUTPUT_DIR --c diff --git a/type4py/gen_type_cluster.py b/type4py/gen_type_cluster.py index 5a0e1a7..9c91aa7 100644 --- a/type4py/gen_type_cluster.py +++ b/type4py/gen_type_cluster.py @@ -163,16 +163,16 @@ def gen_type_cluster(output_path: str, data_loading_funcs: dict, datatype: str, # update and save the annoy_index and embed_labels if cluster_file is not None: - os.remove(join(output_path,cluster_file)) - cluster_file = cluster_file + "_" + datatype - annoy_index.save(join(output_path, cluster_file)) + cluster_file_new = cluster_file + "_" + datatype + annoy_index.save(join(output_path, cluster_file_new)) + os.remove(join(output_path, cluster_file)) else: annoy_index.save(join(output_path, f"type4py_{data_loading_funcs['name']}_type_cluster_{datatype}")) if embedded_file is not None: + embedded_file_new = processed_type_em + "_" + datatype + ".npy" + np.save(join(output_path, embedded_file_new), embed_labels) os.remove(join(output_path, embedded_file)) - embedded_file = processed_type_em + "_" + datatype - np.save(join(output_path, embedded_file), embed_labels) else: np.save(join(output_path, f"type4py_{data_loading_funcs['name']}_true_{datatype}.npy"), embed_labels) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 562641b..508bc95 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -91,6 +91,8 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m with open(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl"), 'rb') as f1: count_types_var = pickle.load(f1) count_types.update(count_types_var) + # delete the old existing pkl + os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) # also add suffix to filename type_filename = type_filename + "_var" @@ -99,6 +101,7 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m with open(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl"), 'rb') as f2: count_types_param = pickle.load(f2) count_types.update(count_types_param) + os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) type_filename = type_filename + 
"_param" # if find existing types in "ret" dataset, load them for updating for final common types @@ -106,6 +109,7 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m with open(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl"), 'rb') as f3: count_types_ret = pickle.load(f3) count_types.update(count_types_ret) + os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl")) type_filename = type_filename + "_ret" common_types = [t.item() for t in train_data_loader.dataset.labels if count_types[t.item()] >= 100] @@ -129,7 +133,7 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m if trained_types == None: trained_model_name = f"type4py_{data_loading_funcs['name']}_model.pt" - logger.info(f"No trained model found, starting to intialize the model {trained_model_name}...") + logger.info(f"No trained model found, starting to initialize the model {trained_model_name}...") # Loading the model model = load_model(data_loading_funcs['name'], model_params) logger.info(f"Intializing the {model.__class__.__name__} model") diff --git a/type4py/preprocess.py b/type4py/preprocess.py index 83482fe..15cf1c7 100644 --- a/type4py/preprocess.py +++ b/type4py/preprocess.py @@ -305,7 +305,7 @@ def preprocess_ext_fns(output_dir: str, limit: int = None, apply_random_vth: boo if not (os.path.exists(os.path.join(output_dir, "all_fns.csv")) and os.path.exists(os.path.join(output_dir, "all_vars.csv"))): logger.info("Merging JSON projects") - merged_jsons = merge_jsons_to_dict(list_files(os.path.join(output_dir, 'processed_projects'), ".json"), limit) + merged_jsons = merge_jsons_to_dict(list_files(os.path.join(output_dir, 'processed_projects'), ".json"), output_dir, limit) logger.info("Creating functions' Dataframe") create_dataframe_fns(output_dir, merged_jsons) logger.info("Creating variables' Dataframe") diff --git a/type4py/reduce.py b/type4py/reduce.py index d9d9514..86a6bfa 100644 --- a/type4py/reduce.py +++ b/type4py/reduce.py @@ -23,9 +23,9 @@ def reduce_tc(args): if os.path.exists(join(args.o, "type4py_complete_type_cluster")): logger.info("Loading type clusters: type4py_complete_type_cluster") type_cluster_index.load(join(args.o, "type4py_complete_type_cluster")) - elif os.path.exists(join(args.o, "type4py_complete_type_cluster_var_param_return")): - logger.info("Loading type clusters: type4py_complete_type_cluster_var_param_return") - type_cluster_index.load(join(args.o, "type4py_complete_type_cluster_var_param_return")) + elif os.path.exists(join(args.o, "type4py_complete_type_cluster_var_param_ret")): + logger.info("Loading type clusters: type4py_complete_type_cluster_var_param_ret") + type_cluster_index.load(join(args.o, "type4py_complete_type_cluster_var_param_ret")) else: raise TypeClusterNotFound diff --git a/type4py/to_onnx.py b/type4py/to_onnx.py index 57a23df..dcccb1c 100644 --- a/type4py/to_onnx.py +++ b/type4py/to_onnx.py @@ -21,9 +21,9 @@ def type4py_to_onnx(args): if os.path.exists(join(args.o, "type4py_complete_model.pt")): logger.info("Loading the pre-trained Type4Py model") type4py_model = torch.load(join(args.o, "type4py_complete_model.pt")).model - elif os.path.exists(join(args.o, "type4py_complete_model_var_param_return.pt")): + elif os.path.exists(join(args.o, "type4py_complete_model_var_param_ret.pt")): logger.info("Loading the pre-trained Type4Py model") - type4py_model = torch.load(join(args.o, "type4py_complete_model_var_param_return.pt")).model + type4py_model = 
torch.load(join(args.o, "type4py_complete_model_var_param_ret.pt")).model else: raise FileNotFoundError("Type4Py model not found!") From e01df3ccc96ab3a16c9b03d949d7f73c5033bc78 Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 18 Aug 2023 13:09:01 +0200 Subject: [PATCH 25/43] update pipeline --- type4py/__main__.py | 4 +- type4py/eval.py | 94 ++++++++++++++++++++++------------------ type4py/predict_split.py | 3 +- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index f9ab8a7..9aefb75 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -104,7 +104,7 @@ def predict_split(args): from type4py.predict_split import test_split setup_logs_file(args.o, "predict_sep") if args.c: - test_split(args.o, data_loading_comb_sep) + test_split(args.o, data_loading_comb) def eval(args): from type4py.eval import evaluate @@ -257,7 +257,7 @@ def main(): infer_parser_pro.add_argument('--o', '--output', required=True, type=str, help="Path to store the ml_infer outputs") infer_parser_pro.add_argument('--a', '--approach', required=True, type=str, - help="infer approach includes ml, hybrid0, hybrid1") + help="infer approach includes t4py, t4pyre, t4pyright, t4pysa") # split according to dataset_split_repo.csv infer_parser_pro.add_argument('--split', '--split_file', required=True, type=str, help="file to store the split of projects") diff --git a/type4py/eval.py b/type4py/eval.py index b8c6426..7654fb8 100644 --- a/type4py/eval.py +++ b/type4py/eval.py @@ -10,9 +10,9 @@ logger.name = __name__ -def eval_type_embed(y_pred: np.array, y_true: np.array, ubiquitous_types: set, common_types: set, - top_n: int=10): +def eval_type_embed(y_pred: np.array, y_true: np.array, ubiquitous_types: set, common_types: set, + top_n: int = 10): all_ubiq_types = 0 corr_ubiq_types = 0 all_common_types = 0 @@ -26,7 +26,7 @@ def eval_type_embed(y_pred: np.array, y_true: np.array, ubiquitous_types: set, c corr_rare_mask = np.array([False] * len(y_pred), dtype=np.bool) for idx, p in enumerate(y_pred): - + if y_true[idx] in ubiquitous_types: all_ubiq_types += 1 if y_true[idx] in p[:top_n]: @@ -44,11 +44,13 @@ def eval_type_embed(y_pred: np.array, y_true: np.array, ubiquitous_types: set, c corr_rare_types += 1 corr_rare_mask[idx] = True - return (corr_ubiq_types + corr_common_types + corr_rare_types) / len(y_pred) * 100.0, corr_ubiq_types / all_ubiq_types * 100.0, \ - corr_common_types / all_common_types * 100.0, corr_rare_types / all_rare_types * 100.0, corr_common_mask, corr_rare_mask + return (corr_ubiq_types + corr_common_types + corr_rare_types) / len( + y_pred) * 100.0, corr_ubiq_types / all_ubiq_types * 100.0, \ + corr_common_types / all_common_types * 100.0, corr_rare_types / all_rare_types * 100.0, corr_common_mask, corr_rare_mask + def eval_parametric_match(y_pred: np.array, y_true: np.array, ubiquitous_types: str, - common_types: set, label_enc, top_n: int=10): + common_types: set, label_enc, top_n: int = 10): """ Finds correct parametric types in predicted types. That is, List[*] is parametric type. Only outermost is considered, which is List in the given example. 
@@ -68,11 +70,11 @@ def pred_param_types(pred_types: np.array, true_param_type): if true_param_type.group(1) == re.match(param_type_match, p).group(1): no_match += 1 break - + return no_match for idx, t in enumerate(tqdm(y_true, total=len(y_true), desc="Calculating parametric match")): - + if t in ubiquitous_types: # The selected ubiquitous types are not parametric types if t in y_pred[idx][:top_n]: @@ -96,7 +98,8 @@ def pred_param_types(pred_types: np.array, true_param_type): corr_param_rare_types += pred_param_types(y_pred[idx], matched_param_type) return (corr_ubiq_types + corr_param_common_types + corr_param_rare_types) / len(y_pred) * 100.0, \ - corr_param_common_types / all_param_common_types * 100.0, corr_param_rare_types / all_param_rare_types * 100.0 + corr_param_common_types / all_param_common_types * 100.0, corr_param_rare_types / all_param_rare_types * 100.0 + def eval_pred_dsl(test_pred: List[dict], common_types, tasks: set, top_n=10, mrr_all=False): """ @@ -107,7 +110,7 @@ def eval_pred_dsl(test_pred: List[dict], common_types, tasks: set, top_n=10, mrr def pred_types_fix(y_true: str, y_pred: List[Tuple[str, int]]): for i, (p, _) in enumerate(y_pred[:top_n]): if p == y_true: - return p, 1/(i+1) + return p, 1 / (i + 1) return y_pred[0][0], 0.0 @@ -118,19 +121,20 @@ def is_param_correct(true_param_type: str, pred_types: np.array): if re.match(param_type_match, p): if re.match(param_type_match, true_param_type).group(1) == re.match(param_type_match, p).group(1): no_match += 1 - r = 1/(i+1) + r = 1 / (i + 1) break else: if re.match(param_type_match, true_param_type).group(1).lower() == p.lower(): no_match += 1 - r = 1/(i+1) + r = 1 / (i + 1) break - + return no_match, r - #ubiquitous_types = {'str', 'int', 'list', 'bool', 'float'} - ubiquitous_types = {'str', 'int', 'list', 'bool', 'float', 'typing.Text', 'typing.List', 'typing.List[typing.Any]', 'typing.list'} - #common_types = common_types - ubiquitous_types + # ubiquitous_types = {'str', 'int', 'list', 'bool', 'float'} + ubiquitous_types = {'str', 'int', 'list', 'bool', 'float', 'typing.Text', 'typing.List', 'typing.List[typing.Any]', + 'typing.list'} + # common_types = common_types - ubiquitous_types all_ubiq_types = 0 corr_ubiq_types = 0 @@ -152,7 +156,7 @@ def is_param_correct(true_param_type: str, pred_types: np.array): mrr_param_ubiq = [] mrr_param_comm = [] mrr_param_rare = [] - + for p in tqdm(test_pred, total=len(test_pred)): if p['task'] not in tasks: @@ -160,7 +164,7 @@ def is_param_correct(true_param_type: str, pred_types: np.array): top_n_pred, r = pred_types_fix(p['original_type'], p['predictions']) mrr.append(r) - + if p['original_type'] in ubiquitous_types: all_ubiq_types += 1 mrr_exact_ubiq.append(r) @@ -190,39 +194,47 @@ def is_param_correct(true_param_type: str, pred_types: np.array): # mrr_exact_rare.append(r) tasks = 'Combined' if tasks == {'Parameter', 'Return', 'Variable'} else list(tasks)[0] - logger.info(f"Type4Py - {tasks} - Exact match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types) / (all_ubiq_types+all_common_types+all_rare_types) * 100.0:.1f}%") + logger.info( + f"Type4Py - {tasks} - Exact match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") logger.info(f"Type4Py - {tasks} - Exact match - ubiquitous: {corr_ubiq_types / all_ubiq_types * 100.0:.1f}%") logger.info(f"Type4Py - {tasks} - Exact match - common: {corr_common_types / all_common_types * 100.0:.1f}%") logger.info(f"Type4Py - {tasks} - Exact 
match - rare: {corr_rare_types / all_rare_types * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Parametric match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types + corr_param_common_types + corr_param_rare_types) / (all_ubiq_types+all_common_types+all_rare_types) * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Parametric match - common: {(corr_param_common_types + corr_common_types) / all_common_types * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Parametric match - rare: {(corr_param_rare_types+corr_rare_types) / all_rare_types * 100.0:.1f}%") - - logger.info(f"Type4Py - Mean reciprocal rank {np.mean(mrr)*100:.1f}") - - if mrr_all: - logger.info(f"Type4Py - {tasks} - MRR - Exact match - all: {np.mean(mrr)*100:.1f}") - logger.info(f"Type4Py - {tasks} - MRR - Exact match - ubiquitous: {np.mean(mrr_exact_ubiq)*100:.1f}") - logger.info(f"Type4Py - {tasks} - MRR - Exact match - common: {np.mean(mrr_exact_comm)*100:.1f}") - logger.info(f"Type4Py - {tasks} - MRR - Exact match - rare: {np.mean(mrr_exact_rare)*100:.1f}") - #print(mrr_param_comm) - logger.info(f"Type4Py - {tasks} - MRR - Parameteric match - all: {np.mean(mrr_exact_ubiq+mrr_exact_comm+mrr_exact_rare+mrr_param_comm+mrr_param_rare)*100:.1f}") - logger.info(f"Type4Py - {tasks} - MRR - Parameteric match - common: {np.mean(mrr_param_comm+mrr_exact_comm)*100:.1f}") - logger.info(f"Type4Py - {tasks} - MRR - Parameteric match - rare: {np.mean(mrr_param_rare+mrr_exact_rare)*100:.1f}") + logger.info( + f"Type4Py - {tasks} - Parametric match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types + corr_param_common_types + corr_param_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") + logger.info( + f"Type4Py - {tasks} - Parametric match - common: {(corr_param_common_types + corr_common_types) / all_common_types * 100.0:.1f}%") + logger.info( + f"Type4Py - {tasks} - Parametric match - rare: {(corr_param_rare_types + corr_rare_types) / all_rare_types * 100.0:.1f}%") - return np.mean(mrr)*100 - -def evaluate(output_path: str, data_name: str, tasks: set, top_n: int=10, mrr_all=False): + logger.info(f"Type4Py - Mean reciprocal rank {np.mean(mrr) * 100:.1f}") + if mrr_all: + logger.info(f"Type4Py - {tasks} - MRR - Exact match - all: {np.mean(mrr) * 100:.1f}") + logger.info(f"Type4Py - {tasks} - MRR - Exact match - ubiquitous: {np.mean(mrr_exact_ubiq) * 100:.1f}") + logger.info(f"Type4Py - {tasks} - MRR - Exact match - common: {np.mean(mrr_exact_comm) * 100:.1f}") + logger.info(f"Type4Py - {tasks} - MRR - Exact match - rare: {np.mean(mrr_exact_rare) * 100:.1f}") + # print(mrr_param_comm) + logger.info( + f"Type4Py - {tasks} - MRR - Parameteric match - all: {np.mean(mrr_exact_ubiq + mrr_exact_comm + mrr_exact_rare + mrr_param_comm + mrr_param_rare) * 100:.1f}") + logger.info( + f"Type4Py - {tasks} - MRR - Parameteric match - common: {np.mean(mrr_param_comm + mrr_exact_comm) * 100:.1f}") + logger.info( + f"Type4Py - {tasks} - MRR - Parameteric match - rare: {np.mean(mrr_param_rare + mrr_exact_rare) * 100:.1f}") + + return np.mean(mrr) * 100 + + +def evaluate(output_path: str, data_name: str, tasks: set, top_n: int = 10, mrr_all=False): logger.info(f"Evaluating the Type4Py {data_name} model for {tasks} prediction task") logger.info(f"*************************************************************************") # Loading label encoder andd common types test_pred = load_json(join(output_path, f'type4py_{data_name}_test_predictions.json')) le_all = pickle.load(open(join(output_path, 
"label_encoder_all.pkl"), 'rb')) - common_types = pickle.load(open(join(output_path, "complete_common_types.pkl"), 'rb')) + common_types = pickle.load(open(join(output_path, "complete_common_types_ret_var_param.pkl"), 'rb')) + common_types = [int(element) for element in common_types] common_types = set(le_all.inverse_transform(list(common_types))) - #ubiquitous_types = {'str', 'int', 'list', 'bool', 'float'} - #common_types = common_types - ubiquitous_types - + # ubiquitous_types = {'str', 'int', 'list', 'bool', 'float'} + # common_types = common_types - ubiquitous_types + eval_pred_dsl(test_pred, common_types, tasks, top_n=top_n, mrr_all=mrr_all) \ No newline at end of file diff --git a/type4py/predict_split.py b/type4py/predict_split.py index fedb22f..e22f8cf 100644 --- a/type4py/predict_split.py +++ b/type4py/predict_split.py @@ -47,7 +47,8 @@ def test_split(output_path: str, data_loading_funcs: dict): logger.info("Loading the reduced type clusters") pca_transform = pickle.load(open(join(output_path, "type_clusters_pca.pkl"), 'rb')) - embed_labels = np.load(join(output_path, f"type4py_{data_loading_funcs['name']}_true.npy")) + embed_labels = np.load(join(output_path, f"type4py_{data_loading_funcs['name']}_true_var_param_ret.npy")) + embed_labels = np.array(embed_labels, dtype=int) annoy_index = AnnoyIndex(pca_transform.n_components_, 'euclidean') annoy_index.load(join(output_path, "type4py_complete_type_cluster_reduced")) From 3eb4fe9e24fce0e1c4757294bee100ec73c8c0c1 Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 18 Aug 2023 13:16:14 +0200 Subject: [PATCH 26/43] update pipeline --- README.md | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 746cafd..a2f2180 100644 --- a/README.md +++ b/README.md @@ -63,15 +63,22 @@ $ type4py vectorize --o $OUTPUT_DIR Description: - `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects. -## 4. Learning -``` -$ type4py learn --o $OUTPUT_DIR --c --p $PARAM_FILE -``` -Description: -- `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects. -- `--c`: Trains the complete model. Use `type4py learn -h` to see other configurations. +[//]: # (## 4. Learning) -- `--p $PARAM_FILE`: The path to user-provided hyper-parameters for the model. See [this](https://github.com/saltudelft/type4py/blob/main/type4py/model_params.json) file as an example. [Optional] +[//]: # (```) + +[//]: # ($ type4py learn --o $OUTPUT_DIR --c --p $PARAM_FILE) + +[//]: # (```) + +[//]: # (Description:) + +[//]: # (- `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects.) + +[//]: # (- `--c`: Trains the complete model. Use `type4py learn -h` to see other configurations.) + +[//]: # () +[//]: # (- `--p $PARAM_FILE`: The path to user-provided hyper-parameters for the model. See [this](https://github.com/saltudelft/type4py/blob/main/type4py/model_params.json) file as an example. [Optional]) ## 4*. Learning separately ``` @@ -83,24 +90,35 @@ $ type4py learns --o $OUTPUT_DIR --dt $DATA_TYPE --c --p $PARAM_FILE - `--p $PARAM_FILE`: The path to user-provided hyper-parameters for the model. See [this](https://github.com/saltudelft/type4py/blob/main/type4py/model_params.json) file as an example. [Optional] -## 4**. Gernerating Type Cluster +## 5**. 
Gernerating Type Cluster ``` $ type4py gen_type_clu --o $OUTPUT_DIR --dt $DATA_TYPE ``` - `$OUTPUT_DIR`: The path that was used in the previous step to store processed projects. - `$DATA_TYPE`: Sequential Learing, either `var`, or `param` or `ret` +## 6. Reducing Type Cluster +To reduce the dimension of the created type clusters in step 5, run the following command: +> Note: The reduced version of type clusters causes a slight performance loss in type prediction. +``` +$ type4py reduce --o $OUTPUT_DIR --d $DIMENSION +``` + +Description: +- `$OUTPUT_DIR`: The path that was used in the first step to store processed projects. +- `$DIMENSION`: Reduces the dimension of type clusters to the specified value [Default: 256] -## 5. Testing +## 7. Testing ``` -$ type4py predict --o $OUTPUT_DIR --c +$ type4py predicts --o $OUTPUT_DIR ``` Description: - `$OUTPUT_DIR`: The path that was used in the first step to store processed projects. -- `--c`: Predicts using the complete model. Use `type4py predict -h` to see other configurations. -## 6. Evaluating +[//]: # (- `--c`: Predicts using the complete model. Use `type4py predict -h` to see other configurations.) + +## 8. Evaluating ``` $ type4py eval --o $OUTPUT_DIR --t c --tp 10 ``` @@ -113,16 +131,6 @@ Description: Use `type4py eval -h` to see other options. -## Reduce -To reduce the dimension of the created type clusters in step 5, run the following command: -> Note: The reduced version of type clusters causes a slight performance loss in type prediction. -``` -$ type4py reduce --o $OUTPUT_DIR --d $DIMENSION -``` - -Description: -- `$OUTPUT_DIR`: The path that was used in the first step to store processed projects. -- `$DIMENSION`: Reduces the dimension of type clusters to the specified value [Default: 256] # Converting Type4Py to ONNX To convert the pre-trained Type4Py model to the [ONNX](https://onnxruntime.ai/) format, use the following command: From 33d169f9f5bd08bf761fabe9a37b1e05144fafab Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 18 Aug 2023 13:29:04 +0200 Subject: [PATCH 27/43] update Dockerfile for cuda version --- Dockerfile.cuda | 68 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index 77dc7af..33bdf59 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -1,15 +1,41 @@ -# NOTE: This Docker file is configured to deploy Type4Py on our server and GPUs. -# For us, these configs seem to work: CUDA 11.0.3, ONNX v1.10.0, nvidia driver 450.36.06 +# FROM --platform=linux/amd64 ubuntu + FROM nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04 -WORKDIR /type4py/ -# Put the required models files in a folder "t4py_model_files" inside "/type4py" -# -type4py/ -# --type4py/ -# --t4py_model_files/ -COPY . 
/type4py -ENV T4PY_LOCAL_MODE="1" -RUN apt update --fix-missing -y && apt upgrade -y && apt install -y python3-pip libpq-dev +RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone + +# RUN apt-get purge libappstream3 +RUN apt-get update + +# python 3.8 installed by one of the following packages +# install packages needed +RUN apt-get install -y vim +RUN apt-get install -y wget +RUN apt-get install unzip +RUN apt-get install -y git +RUN apt install -y software-properties-common +RUN add-apt-repository ppa:deadsnakes/ppa + +RUN apt install -y expect + +RUN apt-get install -y python3-distutils + +RUN wget https://bootstrap.pypa.io/get-pip.py +RUN python3 get-pip.py + +RUN pip --version + +RUN apt-get install -y libssl-dev + +# download watchman +RUN wget https://github.com/facebook/watchman/releases/download/v2022.12.12.00/watchman_ubuntu20.04_v2022.12.12.00.deb +# RUN dpkg -i watchman_ubuntu20.04_v2022.12.12.00.deb +# RUN apt-get -f -y install +# RUN watchman version + +RUN apt install -y python3.8-venv +RUN python3 -m venv py38 +# RUN /bin/bash -c "source py38/bin/activate" # The current model files are pickled with the below ver. of sklearn RUN pip install scikit-learn==0.24.1 @@ -20,20 +46,22 @@ RUN pip install https://type4py.com/pretrained_models/annoy-wheels/annoy-1.17.0- # For production env., install ONNXRuntime with GPU support RUN pip install onnx==1.10 onnxruntime==1.10 onnxruntime-gpu==1.10 -# Install Type4Py -RUN pip install -e . +RUN pip install --upgrade pip +RUN pip install setuptools-rust -# Web server's required packages -RUN pip install -r type4py/server/requirements.txt +# install libsa4py +RUN git clone https://github.com/LangFeng0912/libsa4py.git +RUN pip install -r libsa4py/requirements.txt +RUN pip install -e libsa4py/ + +# install type4py +RUN git clone https://github.com/LangFeng0912/type4py.git +# RUN pip install -e type4py/ -# Install NLTK corpus RUN python3 -c "import nltk; nltk.download('stopwords')" RUN python3 -c "import nltk; nltk.download('wordnet')" RUN python3 -c "import nltk; nltk.download('omw-1.4')" RUN python3 -c "import nltk; nltk.download('averaged_perceptron_tagger')" -WORKDIR /type4py/type4py/server/ - -EXPOSE 5010 - -CMD ["bash", "run_server.sh"] \ No newline at end of file +# download dataset +RUN wget https://zenodo.org/record/8255564/files/ManyTypes4PyV8.tar.gz?download=1 From 8e0f0de35a0abbbf369cff01e001bbf5f3d5f3ff Mon Sep 17 00:00:00 2001 From: fenglang Date: Fri, 18 Aug 2023 17:20:35 +0200 Subject: [PATCH 28/43] update model parameters --- type4py/model_params.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/type4py/model_params.json b/type4py/model_params.json index afeae99..4cc9740 100644 --- a/type4py/model_params.json +++ b/type4py/model_params.json @@ -1,5 +1,5 @@ { - "epochs": 25, + "epochs": 5, "lr": 0.002, "dr": 0.25, "output_size": 1536, From 3dc74c59dc46c52b68861bd4917c99e5f5a9097e Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 11:35:44 +0200 Subject: [PATCH 29/43] update model parameters --- type4py/learn_split.py | 96 +++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 508bc95..d7990a8 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -24,20 +24,29 @@ logger.name = __name__ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -def check_pickle_file(type, data_loading_funcs, output_path): - 
var_exist = False - param_exist = False - ret_exist = False - if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) and type != "var": - var_exist = True - logger.info(f"find existing {data_loading_funcs['name']}_common_types_var.pkl file !") - if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) and type != "param": - param_exist = True - logger.info(f"find existing {data_loading_funcs['name']}_common_types_param.pkl file !") - if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl")) and type != "ret": - ret_exist = True - logger.info(f"find existing {data_loading_funcs['name']}_common_types_ret.pkl file !") - return var_exist, param_exist, ret_exist +def check_pickle_file(data_loading_funcs, output_path): + prefix = f"{data_loading_funcs['name']}_common_types" + suffix = "pkl" + for filename in os.listdir(output_path): + if filename.startswith(prefix) and filename.endswith(suffix): + logger.info(f"find existing common types file: {filename}!") + middle = filename[len(prefix):-len(suffix)] + trained = middle.split("_") + return filename, trained + return None, None + # var_exist = False + # param_exist = False + # ret_exist = False + # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) and type != "var": + # var_exist = True + # logger.info(f"find existing {data_loading_funcs['name']}_common_types_var.pkl file !") + # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) and type != "param": + # param_exist = True + # logger.info(f"find existing {data_loading_funcs['name']}_common_types_param.pkl file !") + # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl")) and type != "ret": + # ret_exist = True + # logger.info(f"find existing {data_loading_funcs['name']}_common_types_ret.pkl file !") + # return var_exist, param_exist, ret_exist # find existing trained model, return trained_types @@ -76,41 +85,16 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb')) count_types = Counter(train_data_loader.dataset.labels.data.numpy()) - var_exists, param_exits, ret_exists = check_pickle_file(dataset_type, data_loading_funcs, output_path) - - if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl")): - logger.warn(f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl file exists!") - - with open(join(output_path, f"{data_loading_funcs['name']}_common_types_{dataset_type}.pkl"), 'wb') as f: - pickle.dump(count_types, f) - - type_filename = dataset_type - - # if find existing types in "var" dataset, load them for updating for final common types - if var_exists and dataset_type != "var": - with open(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl"), 'rb') as f1: - count_types_var = pickle.load(f1) - count_types.update(count_types_var) - # delete the old existing pkl - os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) - # also add suffix to filename - type_filename = type_filename + "_var" - - # if find existing types in "param" dataset, load them for updating for final common types - if param_exits and dataset_type != "param": - with open(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl"), 'rb') as f2: - count_types_param = pickle.load(f2) - 
count_types.update(count_types_param) - os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) - type_filename = type_filename + "_param" - - # if find existing types in "ret" dataset, load them for updating for final common types - if ret_exists and dataset_type != "ret": - with open(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl"), 'rb') as f3: - count_types_ret = pickle.load(f3) - count_types.update(count_types_ret) - os.remove(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl")) - type_filename = type_filename + "_ret" + common_typefile, common_datatype = check_pickle_file(data_loading_funcs, output_path) + if common_datatype == None: + common_typefile = f"{data_loading_funcs['name']}_common_types.pkl" + + else: + logger.info(f"Load existing {common_typefile} file !") + with open(join(output_path, common_typefile), 'rb') as f1: + count_types_pre = pickle.load(f1) + count_types.update(count_types_pre) + common_types = [t.item() for t in train_data_loader.dataset.labels if count_types[t.item()] >= 100] ubiquitous_types = set(le_all.transform(['str', 'int', 'list', 'bool', 'float'])) @@ -124,9 +108,15 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m train_data_loader.dataset.labels if t.item() in common_types]) / train_data_loader.dataset.labels.shape[0] * 100.0)) - - with open(join(output_path, f"{data_loading_funcs['name']}_common_types_{type_filename}.pkl"), 'wb') as f: + # saving common types + logger.info("Saving common types...") + with open(join(output_path, f"{common_typefile[:-4]}_{dataset_type}.pkl"), 'wb') as f: pickle.dump(common_types, f) + # remove old common types + if common_datatype is not None: + os.remove(join(output_path, common_typefile)) + + # get the trained_model name and trained_types trained_model_name, trained_types = find_existing_model(data_loading_funcs, output_path) @@ -161,8 +151,8 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m # Saving the model logger.info("Saved the trained Type4Py model for %s prediction on the disk" % data_loading_funcs['name']) + torch.save(model.module if torch.cuda.device_count() > 1 else model, + join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) # remove old model if exists(join(output_path, trained_model_name)): os.remove(join(output_path, trained_model_name)) - torch.save(model.module if torch.cuda.device_count() > 1 else model, - join(output_path, f"{trained_model_name[:-3]}_{dataset_type}.pt")) From 97c1c1182a46b1cd05c292a75c50451aeefd285b Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 11:39:22 +0200 Subject: [PATCH 30/43] update model parameters --- type4py/learn_split.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/type4py/learn_split.py b/type4py/learn_split.py index d7990a8..8741ee4 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -34,19 +34,6 @@ def check_pickle_file(data_loading_funcs, output_path): trained = middle.split("_") return filename, trained return None, None - # var_exist = False - # param_exist = False - # ret_exist = False - # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_var.pkl")) and type != "var": - # var_exist = True - # logger.info(f"find existing {data_loading_funcs['name']}_common_types_var.pkl file !") - # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_param.pkl")) and type != "param": - # param_exist = True - # 
logger.info(f"find existing {data_loading_funcs['name']}_common_types_param.pkl file !") - # if os.path.exists(join(output_path, f"{data_loading_funcs['name']}_common_types_ret.pkl")) and type != "ret": - # ret_exist = True - # logger.info(f"find existing {data_loading_funcs['name']}_common_types_ret.pkl file !") - # return var_exist, param_exist, ret_exist # find existing trained model, return trained_types From 67b8a70bfd41670b54023a4fd414d88d8bdfcf07 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 12:04:06 +0200 Subject: [PATCH 31/43] update infer main approach --- type4py/__main__.py | 4 ++-- type4py/deploy/infer_project.py | 10 ++++------ type4py/deploy/static_infer.py | 6 +++--- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index 9aefb75..e8ee000 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -259,8 +259,8 @@ def main(): infer_parser_pro.add_argument('--a', '--approach', required=True, type=str, help="infer approach includes t4py, t4pyre, t4pyright, t4pysa") # split according to dataset_split_repo.csv - infer_parser_pro.add_argument('--split', '--split_file', required=True, type=str, - help="file to store the split of projects") + infer_parser_pro.add_argument('--split', '--split_file', type=str, + help="file to store the split of projects", default='test_repo.csv') infer_parser_pro.set_defaults(func=infer_project) # To ONNX format diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index fcfc48f..02925bb 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -27,20 +27,18 @@ def find_test_list(project_dir, dataset_split): if os.path.exists(dataset_split): repos_list: List[dict] = [] - df = pd.read_csv(dataset_split) - test_df = df[df['set'] == 'test'] + test_df = pd.read_csv(dataset_split) for index, row in test_df.iterrows(): project = row['project'] - author = project.split('/')[1] - repo = project.split('/')[2] + author = project.split('/')[0] + repo = project.split('/')[1] project_path = os.path.join(project_dir, author, repo) if os.path.isdir(project_path): repos_list.append({"author": author, "repo": repo}) return repos_list else: - # logger.info(f"dataset_split file: {dataset_split} does not exist!") - raise FileNotFoundError(f"dataset_split file: {dataset_split} does not exist!") + print("test_repo.csv does not exist!") def ml_infer(repo, model, project_dir): project_author = repo["author"] diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index 37952d0..5ebbc41 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -4,10 +4,10 @@ import os from pathlib import Path -import utils.pyre_utils as pyre_util -from utils.utils import rebuild_repo +import type4py.deploy.utils.pyre_utils as pyre_util +from type4py.deploy.utils.utils import rebuild_repo from libsa4py.cst_transformers import TypeAnnotationFinder, TypeAnnotationMasker -from utils.preprocess_utils import check, make_types_consistent +from type4py.deploy.utils.preprocess_utils import check, make_types_consistent from libsa4py.utils import list_files, read_file, write_file from libsa4py.exceptions import ParseError from libsa4py.cst_extractor import Extractor From b369011707cbe0bda897956425cd586830d899f6 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 12:06:53 +0200 Subject: [PATCH 32/43] update infer main approach --- type4py/deploy/static_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index 5ebbc41..1f347c7 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -7,7 +7,7 @@ import type4py.deploy.utils.pyre_utils as pyre_util from type4py.deploy.utils.utils import rebuild_repo from libsa4py.cst_transformers import TypeAnnotationFinder, TypeAnnotationMasker -from type4py.deploy.utils.preprocess_utils import check, make_types_consistent +from type4py.deploy.utils.type_preprocess import check, make_types_consistent from libsa4py.utils import list_files, read_file, write_file from libsa4py.exceptions import ParseError from libsa4py.cst_extractor import Extractor From acbdae17b44be93671059f1d9ae7afe8138ca479 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 17:06:17 +0200 Subject: [PATCH 33/43] update infer main approach --- type4py/deploy/infer.py | 11 +-- type4py/deploy/infer_project.py | 20 ++++- type4py/deploy/utils/extract_types.py | 104 ++++++++++++++++++++++++ type4py/deploy/utils/pyre_merge.py | 2 +- type4py/deploy/utils/pyright_merge.py | 2 +- type4py/deploy/utils/type_preprocess.py | 37 +++++++++ 6 files changed, 165 insertions(+), 11 deletions(-) create mode 100644 type4py/deploy/utils/extract_types.py diff --git a/type4py/deploy/infer.py b/type4py/deploy/infer.py index 89e2e44..068f006 100644 --- a/type4py/deploy/infer.py +++ b/type4py/deploy/infer.py @@ -7,6 +7,7 @@ from type4py.vectorize import IdentifierSequence, TokenSequence, type_vector from type4py.type_check import MypyManager, type_check_single_file from type4py.utils import create_tmp_file, load_model_params +from type4py.deploy.utils.type_preprocess import apply_nlp_transf from libsa4py import PY_BUILTINS_MOD, PY_TYPING_MOD, PY_COLLECTION_MOD from libsa4py.cst_extractor import Extractor from libsa4py.representations import ModuleInfo @@ -75,7 +76,7 @@ def load_pretrained_model(self): 'euclidean') self.type_clusters_idx.load(join(self.pre_trained_model_path, "type4py_complete_type_cluster_reduced" if self.use_pca else "type4py_complete_type_cluster"), prefault=self.pre_read_type_cluster) - self.type_clusters_labels = np.load(join(self.pre_trained_model_path, f"type4py_complete_true.npy")) + self.type_clusters_labels = np.load(join(self.pre_trained_model_path, f"type4py_complete_true_var_param_ret.npy")) self.label_enc = pickle.load(open(join(self.pre_trained_model_path, "label_encoder_all.pkl"), 'rb')) logger.info(f"Loaded the Type Clusters") @@ -742,16 +743,16 @@ def type_annotate_file(pre_trained_m: PretrainedType4Py, source_code: str, sourc else: src_f_read = source_code #src_f_ext = analyze_src_f(src_f_read).to_dict() - ext_type_hints = Extractor.extract(src_f_read, include_seq2seq=False).to_dict() - logger.info("Extracted JSON-representation of input source file") + ext_type_hints = apply_nlp_transf(Extractor.extract(src_f_read, include_seq2seq=False).to_dict()) + # logger.info("Extracted JSON-representation of input source file") all_type_slots, vars_type_hints, params_type_hints, rets_type_hints = get_dps_single_file(ext_type_hints) - logger.info("Extracted type hints from JSON") + # logger.info("Extracted type hints from JSON") ext_type_hints = get_type_preds_single_file(ext_type_hints, all_type_slots, (vars_type_hints, params_type_hints, rets_type_hints), pre_trained_m, filter_pred_types) - logger.info("Predicted type annotations for the given file") + # logger.info("Predicted type annotations for the given file") # type_check_inferred_types(src_f_ext, src_f_read, 
join(dirname(source_file_path), # splitext(basename(source_file_path))[0]+OUTPUT_FILE_SUFFIX)) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index 02925bb..20cdff6 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -8,10 +8,11 @@ import tqdm from type4py.deploy.infer import PretrainedType4Py, type_annotate_file +from type4py.deploy.utils.extract_types import extract_result_ml from type4py import logger from libsa4py.exceptions import ParseError -from libsa4py.utils import list_files, find_repos_list, save_json +from libsa4py.utils import list_files, find_repos_list, save_json, load_json from pathlib import Path import multiprocessing from type4py.deploy.static_infer import pyre_infer, pyright_infer @@ -99,13 +100,23 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): logger.info(f'Totally find {len(repo_infos_test)} projects in project dir') if approach == "t4py": - for repo in tqdm(repo_infos_test): + predict_list = [] + os.makedirs("ml_res", exist_ok=True) + ml_dir = "ml_res" + for repo in repo_infos_test: project_name = "".join((repo["author"], repo["repo"])) - filepath = os.path.join(tar_dir, f"{project_name}_mlInfer.json") - processed_file = ml_infer(repo, model, project_dir, tar_dir) + project_id = "/".join((repo["author"], repo["repo"])) + filepath = os.path.join(ml_dir, f"{project_name}_mlInfer.json") + processed_file = ml_infer(repo, model, project_dir) save_json(filepath, processed_file) + label_filename = "".join((repo["author"], repo["repo"])) + ".json" + label_file = load_json(os.path.join(tar_dir, "processed_projects", label_filename)) + ml_predicts = extract_result_ml(label_file, processed_file, project_id) + predict_list.extend(ml_predicts) + save_json(os.path.join(tar_dir, f"type4py_complete_test_predictions.json"),predict_list) if approach == "t4pyre": + predict_list = [] for repo in tqdm(repo_infos_test): process1 = multiprocessing.Process(target=run_mlInfer) process2 = multiprocessing.Process(target=run_pyreInfer) @@ -125,6 +136,7 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): filepath = os.path.join(tar_dir, f"{project_name}_t4pyreInfer.json") save_json(filepath, hy_result) + if approach == "t4pyright": for repo in tqdm(repo_infos_test): process1 = multiprocessing.Process(target=run_mlInfer) diff --git a/type4py/deploy/utils/extract_types.py b/type4py/deploy/utils/extract_types.py new file mode 100644 index 0000000..9e04484 --- /dev/null +++ b/type4py/deploy/utils/extract_types.py @@ -0,0 +1,104 @@ + +def extract_var(label_unit, process_unit): + if "variables" in label_unit.keys() and "variables_p" in process_unit.keys(): + var_l = label_unit["variables"] + var_p_l = process_unit["variables_p"] + var_list_unit = [] + if len(var_l)!=0: + for var_name in var_l.keys(): + label = var_l[var_name] + if var_name in var_p_l.keys(): + predicts = var_p_l[var_name] + var_list_unit.append({"original_type" : label, "t4py_predicts": predicts, "task": "variable"}) + return var_list_unit + else: + return None + +def extract_param(label_unit, process_unit): + if "params" in label_unit.keys() and "params_p" in process_unit.keys(): + param_l = label_unit["params"] + param_p_l = process_unit["params_p"] + param_list_unit = [] + if len(param_l)!=0: + for param_name in param_l.keys(): + label = param_l[param_name] + if param_name in param_p_l.keys(): + predicts = param_p_l[param_name] + param_list_unit.append({"original_type" : label, "t4py_predicts": predicts, "task": 
"parameter"}) + return param_list_unit + else: + return None + +def extract_ret(label_unit, process_unit): + if "ret_type" in label_unit.keys() and "ret_type_p" in process_unit.keys(): + ret_unit = [] + ret_l = label_unit["ret_type"] + ret_p = process_unit["ret_type_p"] + ret_unit.append({"original_type": ret_l, "t4py_predicts": ret_p, "task": "return types"}) + return ret_unit + else: + return None + + + +def extract_file(label_f, process_f): + type_f = [] + var_list_f = [] + param_list_f = [] + ret_list_f = [] + + var_list = extract_var(label_f, process_f) + if var_list is not None and len(var_list)!=0: + var_list_f.extend(var_list) + + if "funcs" in label_f.keys() and "funcs" in process_f.keys(): + for i in range(len(label_f["funcs"])): + label_func = label_f["funcs"][i] + process_func = process_f["funcs"][i] + if label_func['name'] == process_func['name']: + var_list = extract_var(label_func, process_func) + if var_list is not None and len(var_list) != 0: + var_list_f.extend(var_list) + param_list = extract_param(label_func, process_func) + if param_list is not None and len(param_list) != 0: + param_list_f.extend(param_list) + ret_list = extract_ret(label_func, process_func) + if ret_list is not None and len(ret_list) != 0: + ret_list_f.extend(ret_list) + + if "classes" in label_f.keys() and "classes" in process_f.keys(): + for j in range(len(label_f["classes"])): + label_class = label_f['classes'][j] + process_class = process_f['classes'][j] + if label_class["name"] == process_class["name"]: + var_list = extract_var(label_class, process_class) + if var_list is not None and len(var_list) != 0: + var_list_f.extend(var_list) + if "funcs" in label_class.keys() and "funcs" in process_class.keys(): + for k in range(len(label_class["funcs"])): + label_func = label_class["funcs"][k] + process_func = process_class["funcs"][k] + if label_func['name'] == process_func['name']: + var_list = extract_var(label_func, process_func) + if var_list is not None and len(var_list) != 0: + var_list_f.extend(var_list) + param_list = extract_param(label_func, process_func) + if param_list is not None and len(param_list) != 0: + param_list_f.extend(param_list) + ret_list = extract_ret(label_func, process_func) + if ret_list is not None and len(ret_list) != 0: + ret_list_f.extend(ret_list) + + type_f.extend(var_list_f) + type_f.extend(param_list_f) + type_f.extend(ret_list_f) + return type_f + + +def extract_result_ml(label_file, processed_file, project_id): + type_project = [] + for key in processed_file[project_id]['src_files'].keys(): + typelist_f = extract_file(label_file[project_id]['src_files'][key], processed_file[project_id]['src_files'][key]) + type_project.extend(typelist_f) + return type_project + diff --git a/type4py/deploy/utils/pyre_merge.py b/type4py/deploy/utils/pyre_merge.py index 6125986..7680f86 100644 --- a/type4py/deploy/utils/pyre_merge.py +++ b/type4py/deploy/utils/pyre_merge.py @@ -1,7 +1,7 @@ """ functions for merging the type information from static analysis and machine learning """ -from preprocess_utils import check, make_types_consistent +from type4py.deploy.utils.type_preprocess import check, make_types_consistent def merge_vars(sa_dict, ml_dict): add = 0 diff --git a/type4py/deploy/utils/pyright_merge.py b/type4py/deploy/utils/pyright_merge.py index 3d9bf0d..ac828f5 100644 --- a/type4py/deploy/utils/pyright_merge.py +++ b/type4py/deploy/utils/pyright_merge.py @@ -1,4 +1,4 @@ -from preprocess_utils import check, make_types_consistent, resolve_type_aliasing +from 
type4py.deploy.utils.type_preprocess import check, make_types_consistent, resolve_type_aliasing def merge_vars(var_dict_sa, ml_dict, range): if "variables_p" in ml_dict.keys(): diff --git a/type4py/deploy/utils/type_preprocess.py b/type4py/deploy/utils/type_preprocess.py index 14e2a83..533ff85 100644 --- a/type4py/deploy/utils/type_preprocess.py +++ b/type4py/deploy/utils/type_preprocess.py @@ -6,6 +6,43 @@ ''' import regex +from libsa4py.nl_preprocessing import NLPreprocessor + + +def apply_nlp_transf(extracted_module): + """ + Applies NLP transformation to identifiers in a module + """ + nlp_prep = NLPreprocessor() + + def fn_nlp_transf(fn_d, nlp_prep: NLPreprocessor): + fn_d['name'] = nlp_prep.process_identifier(fn_d['name']) + fn_d['params'] = {nlp_prep.process_identifier(p): t for p, t in fn_d['params'].items()} + fn_d['ret_exprs'] = [nlp_prep.process_identifier(r.replace('return ', '')) for r in fn_d['ret_exprs']] + fn_d['params_occur'] = {p: [nlp_prep.process_sentence(j) for i in o for j in i] for p, o in + fn_d['params_occur'].items()} + fn_d['variables'] = {nlp_prep.process_identifier(v): t for v, t in fn_d['variables'].items()} + fn_d['fn_var_occur'] = {v: [nlp_prep.process_sentence(j) for i in o for j in i] for v, o in + fn_d['fn_var_occur'].items()} + fn_d['params_descr'] = {nlp_prep.process_identifier(p): nlp_prep.process_sentence(fn_d['params_descr'][p]) \ + for p in fn_d['params_descr'].keys()} + fn_d['docstring']['func'] = nlp_prep.process_sentence(fn_d['docstring']['func']) + fn_d['docstring']['ret'] = nlp_prep.process_sentence(fn_d['docstring']['ret']) + fn_d['docstring']['long_descr'] = nlp_prep.process_sentence(fn_d['docstring']['long_descr']) + return fn_d + + extracted_module['variables'] = {nlp_prep.process_identifier(v): t for v, t in + extracted_module['variables'].items()} + extracted_module['mod_var_occur'] = {v: [nlp_prep.process_sentence(j) for i in o for j in i] for v,o in extracted_module['mod_var_occur'].items()} + + for c in extracted_module['classes']: + c['variables'] = {nlp_prep.process_identifier(v): t for v, t in c['variables'].items()} + c['cls_var_occur'] = {v: [nlp_prep.process_sentence(j) for i in o for j in i] for v, o in + c['cls_var_occur'].items()} + c['funcs'] = [fn_nlp_transf(f, nlp_prep) for f in c['funcs']] + extracted_module['funcs'] = [fn_nlp_transf(f, nlp_prep) for f in extracted_module['funcs']] + + return extracted_module def check(t: str): types = ["", "Any", "any", "None", "Object", "object", "type", "Type[Any]", From d6125970ddbac94f0e9a2cd9b43708167db1dd68 Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 17:25:08 +0200 Subject: [PATCH 34/43] update infer main approach --- type4py/deploy/infer_project.py | 20 +++++++---- type4py/deploy/static_infer.py | 53 +++++++++++++++--------------- type4py/deploy/utils/pyre_merge.py | 2 +- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index 20cdff6..ff09840 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -78,11 +78,11 @@ def ml_infer(repo, model, project_dir): return project_analyzed_files -def run_mlInfer(): +def run_mlInfer(repo, model, project_dir): ml_result = ml_infer(repo, model, project_dir) ml_queue.put(ml_result) -def run_pyreInfer(): +def run_pyreInfer(repo, project_dir): pyre_result = pyre_infer(repo, project_dir) pyre_queue.put(pyre_result) @@ -117,9 +117,11 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): if approach == 
"t4pyre": predict_list = [] - for repo in tqdm(repo_infos_test): - process1 = multiprocessing.Process(target=run_mlInfer) - process2 = multiprocessing.Process(target=run_pyreInfer) + os.makedirs("t4pyre_res", exist_ok=True) + ml_dir = "t4pyre_res" + for repo in repo_infos_test: + process1 = multiprocessing.Process(target=run_mlInfer, args=(repo, model, project_dir)) + process2 = multiprocessing.Process(target=run_pyreInfer, args = (repo, project_dir)) # Start the processes process1.start() @@ -133,8 +135,14 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): project_name = "".join((repo["author"], repo["repo"])) hy_result = merge_pyre(ml_result, sa_result, project_id) - filepath = os.path.join(tar_dir, f"{project_name}_t4pyreInfer.json") + filepath = os.path.join(ml_dir, f"{project_name}_t4pyreInfer.json") save_json(filepath, hy_result) + label_filename = "".join((repo["author"], repo["repo"])) + ".json" + label_file = load_json(os.path.join(tar_dir, "processed_projects", label_filename)) + t4pyre_predicts = extract_result_ml(label_file, hy_result, project_id) + predict_list.extend(t4pyre_predicts) + + save_json(os.path.join(tar_dir, f"type4pyre_complete_test_predictions.json"), predict_list) if approach == "t4pyright": diff --git a/type4py/deploy/static_infer.py b/type4py/deploy/static_infer.py index 1f347c7..a82ff77 100644 --- a/type4py/deploy/static_infer.py +++ b/type4py/deploy/static_infer.py @@ -6,6 +6,7 @@ from pathlib import Path import type4py.deploy.utils.pyre_utils as pyre_util from type4py.deploy.utils.utils import rebuild_repo +from type4py.deploy.utils.type_preprocess import apply_nlp_transf from libsa4py.cst_transformers import TypeAnnotationFinder, TypeAnnotationMasker from type4py.deploy.utils.type_preprocess import check, make_types_consistent from libsa4py.utils import list_files, read_file, write_file @@ -49,34 +50,34 @@ def pyre_infer(repo, project_dir): if len(project_files) != 0: print(f'Running pyre query for project {project_path}') - try: - for filename, f_relative in project_files: + for filename, f_relative in project_files: + try: pyre_data_file = pyre_util.pyre_query_types(project_path, filename) project_analyzed_files[project_id]["src_files"][filename] = \ - Extractor.extract(read_file(filename), pyre_data_file).to_dict() - except ParseError as err: - print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) - except UnicodeDecodeError: - print(f"Could not read file {filename}") - except Exception as err: - print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) - - print(f'Saving static analysis results for {project_id}...') - - if len(project_analyzed_files[project_id]["src_files"].keys()) != 0: - project_analyzed_files[project_id]["type_annot_cove"] = \ - round(sum([project_analyzed_files[project_id]["src_files"][s]["type_annot_cove"] for s in - project_analyzed_files[project_id]["src_files"].keys()]) / len( - project_analyzed_files[project_id]["src_files"].keys()), 2) - - pyre_util.watchman_shutdown(project_path) - pyre_util.pyre_server_shutdown(project_path) - pyre_util.clean_config(project_path) - - # remove cache projects - shutil.rmtree(cache_path) - - return project_analyzed_files + apply_nlp_transf(Extractor.extract(read_file(filename), pyre_data_file).to_dict()) + except ParseError as err: + print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + except UnicodeDecodeError: + print(f"Could not read file {filename}") + except Exception as err: + print("project: %s 
|file: %s |Exception: %s" % (project_id, filename, err)) + + print(f'Saving static analysis results for {project_id}...') + + if len(project_analyzed_files[project_id]["src_files"].keys()) != 0: + project_analyzed_files[project_id]["type_annot_cove"] = \ + round(sum([project_analyzed_files[project_id]["src_files"][s]["type_annot_cove"] for s in + project_analyzed_files[project_id]["src_files"].keys()]) / len( + project_analyzed_files[project_id]["src_files"].keys()), 2) + + pyre_util.watchman_shutdown(project_path) + pyre_util.pyre_server_shutdown(project_path) + pyre_util.clean_config(project_path) + + # remove cache projects + shutil.rmtree(cache_path) + + return project_analyzed_files def extract(code): diff --git a/type4py/deploy/utils/pyre_merge.py b/type4py/deploy/utils/pyre_merge.py index 7680f86..ac67e0b 100644 --- a/type4py/deploy/utils/pyre_merge.py +++ b/type4py/deploy/utils/pyre_merge.py @@ -132,7 +132,7 @@ def update_key(file_name, project_id): start_index = i break new_list = [] - new_list.append("data") + new_list.append("raw_projects") while start_index < len(list): new_list.append(list[start_index]) start_index = start_index + 1 From 704af65a01e0eb8578c2f6151e0ea69cc852564b Mon Sep 17 00:00:00 2001 From: fenglang Date: Wed, 23 Aug 2023 17:41:28 +0200 Subject: [PATCH 35/43] update infer main approach --- README.md | 12 ++++++++++++ type4py/deploy/utils/utils.py | 5 ++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a2f2180..96e3e93 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,18 @@ Description: - `$OUTPUT_DIR`: The path that was used in the first step to store processed projects. - `$DIMENSION`: Reduces the dimension of type clusters to the specified value [Default: 256] +## 7*. Project-base inference +```python +$ type4py infer_project --m results --p raw_projects --o results --a t4py +``` +- `$--m`: The path that saved the model +- `$--p`:The path that saved the raw projects, for project-base inference +- `$--o`:The path that output the inference results +- `$--a`:The approach you want, including t4py, t4pyre, t4pyright +```python +$ type4py infer_project --m results --p raw_projects --o results --a t4pyre +``` + ## 7. 
Testing ``` $ type4py predicts --o $OUTPUT_DIR diff --git a/type4py/deploy/utils/utils.py b/type4py/deploy/utils/utils.py index 5a1fdc3..55c3cd3 100644 --- a/type4py/deploy/utils/utils.py +++ b/type4py/deploy/utils/utils.py @@ -53,4 +53,7 @@ def rebuild_repo(project_dir, tar_dir, repo_info): source_files = list_files(repo_path) for filename in source_files: - rebuild(filename, project_dir, tar_dir) \ No newline at end of file + try: + rebuild(filename, project_dir, tar_dir) + except Exception as e: + print(str(e)) \ No newline at end of file From 6fab9773123ebf0f79314b181e16a928822a09a6 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 10:19:14 +0200 Subject: [PATCH 36/43] update type preprocess_list --- type4py/deploy/utils/extract_types.py | 44 +++++++++++++++++++++------ 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/type4py/deploy/utils/extract_types.py b/type4py/deploy/utils/extract_types.py index 9e04484..0ab2126 100644 --- a/type4py/deploy/utils/extract_types.py +++ b/type4py/deploy/utils/extract_types.py @@ -1,46 +1,51 @@ +from type4py.deploy.utils.type_preprocess import make_types_consistent, resolve_type_aliasing, check + +import re + def extract_var(label_unit, process_unit): if "variables" in label_unit.keys() and "variables_p" in process_unit.keys(): var_l = label_unit["variables"] var_p_l = process_unit["variables_p"] var_list_unit = [] - if len(var_l)!=0: + if len(var_l) != 0: for var_name in var_l.keys(): label = var_l[var_name] if var_name in var_p_l.keys(): predicts = var_p_l[var_name] - var_list_unit.append({"original_type" : label, "t4py_predicts": predicts, "task": "variable"}) + var_list_unit.append({"original_type": label, "predictions": predicts, "task": "Variable"}) return var_list_unit else: return None + def extract_param(label_unit, process_unit): if "params" in label_unit.keys() and "params_p" in process_unit.keys(): param_l = label_unit["params"] param_p_l = process_unit["params_p"] param_list_unit = [] - if len(param_l)!=0: + if len(param_l) != 0: for param_name in param_l.keys(): label = param_l[param_name] if param_name in param_p_l.keys(): predicts = param_p_l[param_name] - param_list_unit.append({"original_type" : label, "t4py_predicts": predicts, "task": "parameter"}) + param_list_unit.append({"original_type": label, "predictions": predicts, "task": "Parameter"}) return param_list_unit else: return None + def extract_ret(label_unit, process_unit): if "ret_type" in label_unit.keys() and "ret_type_p" in process_unit.keys(): ret_unit = [] ret_l = label_unit["ret_type"] ret_p = process_unit["ret_type_p"] - ret_unit.append({"original_type": ret_l, "t4py_predicts": ret_p, "task": "return types"}) + ret_unit.append({"original_type": ret_l, "predictions": ret_p, "task": "Return"}) return ret_unit else: return None - def extract_file(label_f, process_f): type_f = [] var_list_f = [] @@ -48,7 +53,7 @@ def extract_file(label_f, process_f): ret_list_f = [] var_list = extract_var(label_f, process_f) - if var_list is not None and len(var_list)!=0: + if var_list is not None and len(var_list) != 0: var_list_f.extend(var_list) if "funcs" in label_f.keys() and "funcs" in process_f.keys(): @@ -95,10 +100,29 @@ def extract_file(label_f, process_f): return type_f +def preprocess_types(type_list): + processed_list = [] + for t in type_list: + processed_label = make_types_consistent(t["original_type"]) + processed_label = resolve_type_aliasing(processed_label) + if check(processed_label): + processed_preds = [] + for p in t["predictions"]: + 
processed_pred = make_types_consistent(p[0]) + processed_pred = resolve_type_aliasing(processed_pred) + if check(processed_pred): + processed_preds.append([processed_pred, p[1]]) + processed_list.append({"original_type": processed_label, "predictions": processed_preds, + "task": t["task"], + 'is_parametric': bool(re.match(r'(.+)\[(.+)\]', processed_label))}) + return processed_list + + def extract_result_ml(label_file, processed_file, project_id): type_project = [] for key in processed_file[project_id]['src_files'].keys(): - typelist_f = extract_file(label_file[project_id]['src_files'][key], processed_file[project_id]['src_files'][key]) + typelist_f = extract_file(label_file[project_id]['src_files'][key], + processed_file[project_id]['src_files'][key]) type_project.extend(typelist_f) - return type_project - + type_project_processed = preprocess_types(type_project) + return type_project_processed From ace9baf0d6893641879736f63fa59c76525b58bf Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 10:45:57 +0200 Subject: [PATCH 37/43] update type preprocess_list --- type4py/__main__.py | 31 ++++++++++++++++++++++++------- type4py/deploy/infer_project.py | 4 ++-- type4py/eval.py | 6 +++--- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/type4py/__main__.py b/type4py/__main__.py index e8ee000..1c32f10 100644 --- a/type4py/__main__.py +++ b/type4py/__main__.py @@ -53,16 +53,19 @@ def extract(args): p = Pipeline(args.c, args.o, True, False, args.d) p.run(find_repos_list(args.c) if args.l is None else find_repos_list(args.c)[:args.l], args.w) + def preprocess(args): from type4py.preprocess import preprocess_ext_fns setup_logs_file(args.o, "preprocess") preprocess_ext_fns(args.o, args.l, args.rvth) + def vectorize(args): from type4py.vectorize import vectorize_args_ret setup_logs_file(args.o, "vectorize") vectorize_args_ret(args.o) + def learn(args): from type4py.learn import train setup_logs_file(args.o, "learn") @@ -75,6 +78,7 @@ def learn(args): else: train(args.o, data_loading_comb, args.p, args.v) + # add learn_split function for CLI command "learn_split" def learn_split(args): from type4py.learn_split import train_split @@ -82,6 +86,7 @@ def learn_split(args): if args.c: train_split(args.o, data_loading_comb_sep, args.dt, args.p, args.v) + def predict(args): from type4py.predict import test setup_logs_file(args.o, "predict") @@ -94,31 +99,34 @@ def predict(args): elif args.c: test(args.o, data_loading_comb, args.l, args.rtc) + # add gen_cluster function for CLI command "gen_clu" def gen_type_cluster(args): from type4py.gen_type_cluster import gen_type_cluster setup_logs_file(args.o, "gen_clusters") gen_type_cluster(args.o, data_loading_comb_sep, args.dt) + def predict_split(args): from type4py.predict_split import test_split setup_logs_file(args.o, "predict_sep") if args.c: test_split(args.o, data_loading_comb) + def eval(args): from type4py.eval import evaluate setup_logs_file(args.o, "eval") tasks = {'c': {'Parameter', 'Return', 'Variable'}, 'p': {'Parameter'}, 'r': {'Return'}, 'v': {'Variable'}} if args.woi: - evaluate(args.o, data_loading_woi['name'], tasks[args.t], args.tp, args.mrr) + evaluate(args.o, args.a, data_loading_woi['name'], tasks[args.t], args.tp, args.mrr) elif args.woc: - evaluate(args.o, data_loading_woc['name'], tasks[args.t], args.tp, args.mrr) + evaluate(args.o, args.a, data_loading_woc['name'], tasks[args.t], args.tp, args.mrr) elif args.wov: - evaluate(args.o, data_loading_wov['name'], tasks[args.t], args.tp, args.mrr) + evaluate(args.o, 
args.a, data_loading_wov['name'], tasks[args.t], args.tp, args.mrr) else: - evaluate(args.o, data_loading_comb['name'], tasks[args.t], args.tp, args.mrr) + evaluate(args.o, args.a, data_loading_comb['name'], tasks[args.t], args.tp, args.mrr) def infer(args): @@ -126,6 +134,7 @@ def infer(args): setup_logs_file(args.m, 'infer') infer_main(args.m, args.f) + # add projects-based infer function for command "infer_project" ''' project-based CLI command includes three approaches: @@ -133,6 +142,8 @@ def infer(args): -t4pyre: typ4py + pyre -t4pyright: type4py + pyright ''' + + def infer_project(args): approach_list = {"t4py", "t4pyre", "t4pyright"} if args.a in approach_list: @@ -142,6 +153,7 @@ def infer_project(args): else: raise InferApproachNotFound + def main(): arg_parser = argparse.ArgumentParser() sub_parsers = arg_parser.add_subparsers(dest='cmd') @@ -214,19 +226,24 @@ def main(): # gen type cluster incremental: predict phase generate type cluster predict_parser_gen_cluster = sub_parsers.add_parser('gen_type_clu') - predict_parser_gen_cluster.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - predict_parser_gen_cluster.add_argument('--dt', '--datatype', required=True, help="Datatype for generating type clusters") + predict_parser_gen_cluster.add_argument('--o', '--output', required=True, type=str, + help="Path to processed projects") + predict_parser_gen_cluster.add_argument('--dt', '--datatype', required=True, + help="Datatype for generating type clusters") predict_parser_gen_cluster.set_defaults(func=gen_type_cluster) # gen predictions via type cluster: predict phase generate predictions predict_parser_gen_pred = sub_parsers.add_parser('predicts') predict_parser_gen_pred.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") - predict_parser_gen_pred.add_argument('--c', '--complete', default=True, action="store_true", help="Complete Type4Py model") + predict_parser_gen_pred.add_argument('--c', '--complete', default=True, action="store_true", + help="Complete Type4Py model") predict_parser_gen_pred.set_defaults(func=predict_split) # Evaluation phase eval_parser = sub_parsers.add_parser('eval') eval_parser.add_argument('--o', '--output', required=True, type=str, help="Path to processed projects") + eval_parser.add_argument('--a', '--approach', required=True, type=str, + help="eval the infer approach includes t4py, t4pyre") eval_parser.add_argument('--t', '--task', default="c", type=str, help="Prediction tasks (combined -> c |parameters -> p| return -> r| variable -> v)") eval_parser.add_argument('--tp', '--topn', default=10, type=int, help="Report top-n predictions [default n=10]") diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index ff09840..61299b7 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -113,7 +113,7 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): label_file = load_json(os.path.join(tar_dir, "processed_projects", label_filename)) ml_predicts = extract_result_ml(label_file, processed_file, project_id) predict_list.extend(ml_predicts) - save_json(os.path.join(tar_dir, f"type4py_complete_test_predictions.json"),predict_list) + save_json(os.path.join(tar_dir, f"{approach}_complete_test_predictions.json"),predict_list) if approach == "t4pyre": predict_list = [] @@ -142,7 +142,7 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): t4pyre_predicts = extract_result_ml(label_file, hy_result, 
project_id) predict_list.extend(t4pyre_predicts) - save_json(os.path.join(tar_dir, f"type4pyre_complete_test_predictions.json"), predict_list) + save_json(os.path.join(tar_dir, f"{approach}_complete_test_predictions.json"), predict_list) if approach == "t4pyright": diff --git a/type4py/eval.py b/type4py/eval.py index 7654fb8..0f26bed 100644 --- a/type4py/eval.py +++ b/type4py/eval.py @@ -225,11 +225,11 @@ def is_param_correct(true_param_type: str, pred_types: np.array): return np.mean(mrr) * 100 -def evaluate(output_path: str, data_name: str, tasks: set, top_n: int = 10, mrr_all=False): +def evaluate(output_path: str, approach_name: str, data_name: str, tasks: set, top_n: int = 10, mrr_all=False): logger.info(f"Evaluating the Type4Py {data_name} model for {tasks} prediction task") logger.info(f"*************************************************************************") - # Loading label encoder andd common types - test_pred = load_json(join(output_path, f'type4py_{data_name}_test_predictions.json')) + # Loading label encoder and common types + test_pred = load_json(join(output_path, f'{approach_name}_{data_name}_test_predictions.json')) le_all = pickle.load(open(join(output_path, "label_encoder_all.pkl"), 'rb')) common_types = pickle.load(open(join(output_path, "complete_common_types_ret_var_param.pkl"), 'rb')) common_types = [int(element) for element in common_types] From 1750b747d2fd531da16c42a111efec2c2bde2370 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 10:53:31 +0200 Subject: [PATCH 38/43] update eval scripts --- type4py/eval.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/type4py/eval.py b/type4py/eval.py index 0f26bed..cdcd906 100644 --- a/type4py/eval.py +++ b/type4py/eval.py @@ -101,7 +101,7 @@ def pred_param_types(pred_types: np.array, true_param_type): corr_param_common_types / all_param_common_types * 100.0, corr_param_rare_types / all_param_rare_types * 100.0 -def eval_pred_dsl(test_pred: List[dict], common_types, tasks: set, top_n=10, mrr_all=False): +def eval_pred_dsl(test_pred: List[dict], common_types, tasks: set, approach, top_n=10, mrr_all=False): """ Computes evaluation metrics such as recall, precision and f1-score """ @@ -194,20 +194,28 @@ def is_param_correct(true_param_type: str, pred_types: np.array): # mrr_exact_rare.append(r) tasks = 'Combined' if tasks == {'Parameter', 'Return', 'Variable'} else list(tasks)[0] + + if approach == "t4py": + approach_name = "Type4Py" + elif approach == "t4pyre": + approach_name = "Type4Pyre" + else: + approach_name = "UnDefined" + logger.info( - f"Type4Py - {tasks} - Exact match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Exact match - ubiquitous: {corr_ubiq_types / all_ubiq_types * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Exact match - common: {corr_common_types / all_common_types * 100.0:.1f}%") - logger.info(f"Type4Py - {tasks} - Exact match - rare: {corr_rare_types / all_rare_types * 100.0:.1f}%") + f"{approach_name} - {tasks} - Exact match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") + logger.info(f"{approach_name} - {tasks} - Exact match - ubiquitous: {corr_ubiq_types / all_ubiq_types * 100.0:.1f}%") + logger.info(f"{approach_name} - {tasks} - Exact match - common: {corr_common_types / all_common_types * 100.0:.1f}%") + 
logger.info(f"{approach_name} - {tasks} - Exact match - rare: {corr_rare_types / all_rare_types * 100.0:.1f}%") logger.info( - f"Type4Py - {tasks} - Parametric match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types + corr_param_common_types + corr_param_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") + f"{approach_name} - {tasks} - Parametric match - all: {(corr_ubiq_types + corr_common_types + corr_rare_types + corr_param_common_types + corr_param_rare_types) / (all_ubiq_types + all_common_types + all_rare_types) * 100.0:.1f}%") logger.info( - f"Type4Py - {tasks} - Parametric match - common: {(corr_param_common_types + corr_common_types) / all_common_types * 100.0:.1f}%") + f"{approach_name} - {tasks} - Parametric match - common: {(corr_param_common_types + corr_common_types) / all_common_types * 100.0:.1f}%") logger.info( - f"Type4Py - {tasks} - Parametric match - rare: {(corr_param_rare_types + corr_rare_types) / all_rare_types * 100.0:.1f}%") + f"{approach_name} - {tasks} - Parametric match - rare: {(corr_param_rare_types + corr_rare_types) / all_rare_types * 100.0:.1f}%") - logger.info(f"Type4Py - Mean reciprocal rank {np.mean(mrr) * 100:.1f}") + logger.info(f"{approach_name}- Mean reciprocal rank {np.mean(mrr) * 100:.1f}") if mrr_all: logger.info(f"Type4Py - {tasks} - MRR - Exact match - all: {np.mean(mrr) * 100:.1f}") @@ -237,4 +245,4 @@ def evaluate(output_path: str, approach_name: str, data_name: str, tasks: set, t # ubiquitous_types = {'str', 'int', 'list', 'bool', 'float'} # common_types = common_types - ubiquitous_types - eval_pred_dsl(test_pred, common_types, tasks, top_n=top_n, mrr_all=mrr_all) \ No newline at end of file + eval_pred_dsl(test_pred, common_types, tasks, approach_name, top_n=top_n, mrr_all=mrr_all) \ No newline at end of file From 46b3ea5883adbc44fd86be4df153c911f144309c Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 11:11:43 +0200 Subject: [PATCH 39/43] update eval scripts --- type4py/deploy/infer_project.py | 11 ++++++++++- type4py/eval.py | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index 61299b7..def6789 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -146,6 +146,9 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): if approach == "t4pyright": + predict_list = [] + os.makedirs("t4pyright_res", exist_ok=True) + ml_dir = "t4pyright_res" for repo in tqdm(repo_infos_test): process1 = multiprocessing.Process(target=run_mlInfer) process2 = multiprocessing.Process(target=run_pyrightInfer) @@ -162,8 +165,14 @@ def infer_projects(model, project_dir, tar_dir, approach, split_file): project_name = "".join((repo["author"], repo["repo"])) hy_result = merge_pyright(ml_result, sa_result, project_id) - filepath = os.path.join(tar_dir, f"{project_name}_t4pyrightInfer.json") + filepath = os.path.join(ml_dir, f"{project_name}_t4pyrightInfer.json") save_json(filepath, hy_result) + label_filename = "".join((repo["author"], repo["repo"])) + ".json" + label_file = load_json(os.path.join(tar_dir, "processed_projects", label_filename)) + t4pyright_predicts = extract_result_ml(label_file, hy_result, project_id) + predict_list.extend(t4pyright_predicts) + save_json(os.path.join(tar_dir, f"{approach}_complete_test_predictions.json"), predict_list) + def infer_project_main(model_path, input_path, output_path, approach, split_file): t4py_pretrained_m = 
PretrainedType4Py(model_path, "gpu", pre_read_type_cluster=False, use_pca=True) diff --git a/type4py/eval.py b/type4py/eval.py index cdcd906..5b8fa04 100644 --- a/type4py/eval.py +++ b/type4py/eval.py @@ -199,6 +199,8 @@ def is_param_correct(true_param_type: str, pred_types: np.array): approach_name = "Type4Py" elif approach == "t4pyre": approach_name = "Type4Pyre" + elif approach == "t4pyright": + approach_name = "Type4Pyright" else: approach_name = "UnDefined" From 9592acf872aeb91d303d52acd63155246265547e Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 12:43:14 +0200 Subject: [PATCH 40/43] update infer project scripts --- type4py/deploy/infer_project.py | 9 ++++++--- type4py/learn_split.py | 2 -- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py index def6789..7ea9d23 100644 --- a/type4py/deploy/infer_project.py +++ b/type4py/deploy/infer_project.py @@ -63,11 +63,14 @@ def ml_infer(repo, model, project_dir): project_analyzed_files[project_id]["src_files"][filename] = \ ext_type_hints except ParseError as err: - print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + pass except UnicodeDecodeError: - print(f"Could not read file {filename}") + # print(f"Could not read file {filename}") + pass except Exception as err: - print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err)) + pass if len(project_analyzed_files[project_id]["src_files"].keys()) != 0: project_analyzed_files[project_id]["type_annot_cove"] = \ diff --git a/type4py/learn_split.py b/type4py/learn_split.py index 8741ee4..48d501c 100644 --- a/type4py/learn_split.py +++ b/type4py/learn_split.py @@ -103,8 +103,6 @@ def train_split(output_path: str, data_loading_funcs: dict, dataset_type: str, m if common_datatype is not None: os.remove(join(output_path, common_typefile)) - - # get the trained_model name and trained_types trained_model_name, trained_types = find_existing_model(data_loading_funcs, output_path) From 40880e1a01ad37e6a1179775b4418f5738f0f28e Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 12:53:46 +0200 Subject: [PATCH 41/43] update infer project scripts --- type4py/deploy/infer.py | 3 ++- type4py/deploy/utils/extract_types.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/type4py/deploy/infer.py b/type4py/deploy/infer.py index 068f006..851ad96 100644 --- a/type4py/deploy/infer.py +++ b/type4py/deploy/infer.py @@ -282,7 +282,8 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]: code_tks_dps = np.concatenate(tuple(code_tks_dps)) vth_dps = np.concatenate(tuple(vth_dps)) else: - id_dps, code_tks_dps, vth_dps = id_dps[0], code_tks_dps[0], vth_dps[0] + if len(id_dps) != 0: + id_dps, code_tks_dps, vth_dps = id_dps[0], code_tks_dps[0], vth_dps[0] preds = type_embed_single_dp(pre_trained_m.type4py_model, id_dps, code_tks_dps, vth_dps) if pre_trained_m.use_pca: diff --git a/type4py/deploy/utils/extract_types.py b/type4py/deploy/utils/extract_types.py index 0ab2126..6c4929a 100644 --- a/type4py/deploy/utils/extract_types.py +++ b/type4py/deploy/utils/extract_types.py @@ -121,8 +121,9 @@ def preprocess_types(type_list): def extract_result_ml(label_file, processed_file, project_id): type_project = [] for key in processed_file[project_id]['src_files'].keys(): - typelist_f = 
extract_file(label_file[project_id]['src_files'][key], - processed_file[project_id]['src_files'][key]) - type_project.extend(typelist_f) + if key in label_file[project_id]['src_files'].keys(): + typelist_f = extract_file(label_file[project_id]['src_files'][key], + processed_file[project_id]['src_files'][key]) + type_project.extend(typelist_f) type_project_processed = preprocess_types(type_project) return type_project_processed From 502a1ca9ac65cd039e6ef9ea76877f01b4de1085 Mon Sep 17 00:00:00 2001 From: fenglang Date: Thu, 24 Aug 2023 13:28:35 +0200 Subject: [PATCH 42/43] update infer project scripts --- type4py/deploy/infer.py | 240 +++++++++++++++++-------------- type4py/deploy/infer_project.py | 8 +- type4py/deploy/utils/__init__.py | 0 3 files changed, 136 insertions(+), 112 deletions(-) create mode 100644 type4py/deploy/utils/__init__.py diff --git a/type4py/deploy/infer.py b/type4py/deploy/infer.py index 851ad96..f416419 100644 --- a/type4py/deploy/infer.py +++ b/type4py/deploy/infer.py @@ -36,13 +36,14 @@ OUTPUT_FILE_SUFFIX = "_type4py_typed.py" ALL_PY_TYPES = set(list(PY_BUILTINS_MOD) + list(PY_COLLECTION_MOD) + list(PY_TYPING_MOD)) + class PretrainedType4Py: def __init__(self, pre_trained_model_path, device='gpu', pre_read_type_cluster=False, use_pca=False): self.pre_trained_model_path = pre_trained_model_path self.device = device self.pre_read_type_cluster = pre_read_type_cluster self.use_pca = use_pca - + self.type4py_model = None self.type4py_model_params = None self.type4py_pca = None @@ -54,14 +55,16 @@ def __init__(self, pre_trained_model_path, device='gpu', pre_read_type_cluster=F def load_pretrained_model(self): self.type4py_model_params = load_model_params() - + if self.device == 'gpu': - self.type4py_model = onnxruntime.InferenceSession(join(self.pre_trained_model_path, f"type4py_complete_model.onnx"), - providers=['CUDAExecutionProvider']) + self.type4py_model = onnxruntime.InferenceSession( + join(self.pre_trained_model_path, f"type4py_complete_model.onnx"), + providers=['CUDAExecutionProvider']) logger.info("The model runs on GPU") elif self.device == 'cpu': - self.type4py_model = onnxruntime.InferenceSession(join(self.pre_trained_model_path, f"type4py_complete_model.onnx"), - providers=['CPUExecutionProvider']) + self.type4py_model = onnxruntime.InferenceSession( + join(self.pre_trained_model_path, f"type4py_complete_model.onnx"), + providers=['CPUExecutionProvider']) logger.info("The model runs on CPU") if self.use_pca: @@ -72,11 +75,14 @@ def load_pretrained_model(self): self.w2v_model = Word2Vec.load(join(self.pre_trained_model_path, 'w2v_token_model.bin')) logger.info(f"Loaded the pre-trained W2V model") - self.type_clusters_idx = AnnoyIndex(self.type4py_pca.n_components_ if self.use_pca else self.type4py_model_params['output_size_prod'], - 'euclidean') - self.type_clusters_idx.load(join(self.pre_trained_model_path, "type4py_complete_type_cluster_reduced" if self.use_pca else "type4py_complete_type_cluster"), + self.type_clusters_idx = AnnoyIndex( + self.type4py_pca.n_components_ if self.use_pca else self.type4py_model_params['output_size_prod'], + 'euclidean') + self.type_clusters_idx.load(join(self.pre_trained_model_path, + "type4py_complete_type_cluster_reduced" if self.use_pca else "type4py_complete_type_cluster"), prefault=self.pre_read_type_cluster) - self.type_clusters_labels = np.load(join(self.pre_trained_model_path, f"type4py_complete_true_var_param_ret.npy")) + self.type_clusters_labels = np.load( + join(self.pre_trained_model_path, 
f"type4py_complete_true_var_param_ret.npy")) self.label_enc = pickle.load(open(join(self.pre_trained_model_path, "label_encoder_all.pkl"), 'rb')) logger.info(f"Loaded the Type Clusters") @@ -101,17 +107,17 @@ def load_pretrained_model(self): def compute_types_score(types_dist: list, types_idx: list, types_embed_labels: np.array): - types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2 - types_dist /= np.sum(types_dist) - types_score = defaultdict(int) - for n, d in zip(types_idx, types_dist): - types_score[types_embed_labels[n]] += d - - return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], - reverse=True) + types_dist = 1 / (np.array(types_dist) + 1e-10) ** 2 + types_dist /= np.sum(types_dist) + types_score = defaultdict(int) + for n, d in zip(types_idx, types_dist): + types_score[types_embed_labels[n]] += d + + return sorted({t: s for t, s in types_score.items()}.items(), key=lambda kv: kv[1], + reverse=True) -def analyze_src_f(src_f: str, remove_preexisting_type_annot:bool=False) -> ModuleInfo: +def analyze_src_f(src_f: str, remove_preexisting_type_annot: bool = False) -> ModuleInfo: """ Removes pre-existing type annotations from a source file if desired """ @@ -119,10 +125,10 @@ def analyze_src_f(src_f: str, remove_preexisting_type_annot:bool=False) -> Modul v = Visitor() if remove_preexisting_type_annot: mw = MetadataWrapper(parse_module(src_f).visit(TypeAnnotationRemover()), - cache={TypeInferenceProvider: {'types':[]}}) + cache={TypeInferenceProvider: {'types': []}}) else: mw = MetadataWrapper(parse_module(src_f), - cache={TypeInferenceProvider: {'types':[]}}) + cache={TypeInferenceProvider: {'types': []}}) mw.visit(v) return ModuleInfo(v.imports, v.module_variables, v.module_variables_use, v.module_vars_ln, v.cls_list, @@ -133,13 +139,17 @@ def type_embed_single_dp(model: onnxruntime.InferenceSession, id_dp, code_tks_dp """ Gives a type embedding for a single test datapoint. """ - model_inputs = {model.get_inputs()[0].name: id_dp.astype(np.float32, copy=False), - model.get_inputs()[1].name: code_tks_dp.astype(np.float32, copy=False), - model.get_inputs()[2].name: vth_dp.astype(np.float32, copy=False)} + if isinstance(id_dp, list) and len(id_dp) == 0: + id_dp = np.array(id_dp) + + model_inputs = {model.get_inputs()[0].name: id_dp.astype(np.float32, copy=False), + model.get_inputs()[1].name: code_tks_dp.astype(np.float32, copy=False), + model.get_inputs()[2].name: vth_dp.astype(np.float32, copy=False)} return model.run(None, model_inputs)[0] -def infer_single_dp(type_cluster_idx: AnnoyIndex, k:int, types_embed_labels:np.array, + +def infer_single_dp(type_cluster_idx: AnnoyIndex, k: int, types_embed_labels: np.array, type_embed_vec: np.array): """ Infers a list of likely types for a single test datapoint. 
@@ -147,6 +157,7 @@ def infer_single_dp(type_cluster_idx: AnnoyIndex, k:int, types_embed_labels:np.a idx, dist = type_cluster_idx.get_nns_by_vector(type_embed_vec, k, include_distances=True) return compute_types_score(dist, idx, types_embed_labels) + def var2vec(vars_type_hints: List[list], w2v_model) -> Tuple[np.array, np.array, np.array]: """ Converts a variable to its type embedding @@ -158,13 +169,14 @@ def var2vec(vars_type_hints: List[list], w2v_model) -> Tuple[np.array, np.array, id_dp = np.stack(id_dp.apply(lambda x: x.generate_datapoint()), axis=0) code_tks_dp = df_var.apply(lambda row: TokenSequence(w2v_model, TOKEN_SEQ_LEN[0], TOKEN_SEQ_LEN[1], - row.var_occur, None, None), axis=1) + row.var_occur, None, None), axis=1) code_tks_dp = np.stack(code_tks_dp.apply(lambda x: x.generate_datapoint()), axis=0) vth_dp = np.stack(df_var.apply(lambda row: type_vector(AVAILABLE_TYPES_NUMBER, row.var_aval_enc), - axis=1), axis=0) + axis=1), axis=0) return id_dp, code_tks_dp, vth_dp + def param2vec(params_type_hints: List[list], w2v_model) -> Tuple[np.array, np.array, np.array]: """ Converts a function argument to its type embedding @@ -181,10 +193,11 @@ def param2vec(params_type_hints: List[list], w2v_model) -> Tuple[np.array, np.ar row.arg_occur, None, None), axis=1) code_tks_dp = np.stack(code_tks_dp.apply(lambda x: x.generate_datapoint()), axis=0) vth_dp = np.stack(df_param.apply(lambda row: type_vector(AVAILABLE_TYPES_NUMBER, row.param_aval_enc), - axis=1), axis=0) + axis=1), axis=0) return id_dp, code_tks_dp, vth_dp + def ret2vec(rets_type_hints: List[list], w2v_model) -> Tuple[np.array, np.array, np.array]: """ Converts a function return to its type embedding @@ -192,19 +205,20 @@ def ret2vec(rets_type_hints: List[list], w2v_model) -> Tuple[np.array, np.array, df_ret = pd.DataFrame(rets_type_hints, columns=['func_name', 'arg_names', 'ret_expr_seq', 'ret_aval_enc']) id_dp = df_ret.apply(lambda row: IdentifierSequence(w2v_model, None, row.arg_names, row.func_name, - None), axis=1) + None), axis=1) id_dp = np.stack(id_dp.apply(lambda x: x.generate_datapoint()), - axis=0) + axis=0) code_tks_dp = df_ret.apply(lambda row: TokenSequence(w2v_model, TOKEN_SEQ_LEN[0], TOKEN_SEQ_LEN[1], None, row.ret_expr_seq, None), axis=1) code_tks_dp = np.stack(code_tks_dp.apply(lambda x: x.generate_datapoint()), axis=0) vth_dp = np.stack(df_ret.apply(lambda row: type_vector(AVAILABLE_TYPES_NUMBER, row.ret_aval_enc), - axis=1), axis=0) + axis=1), axis=0) return id_dp, code_tks_dp, vth_dp + def apply_inferred_types(in_src_f: str, in_src_f_dict: dict, out_src_f_path: str): """ Applies inffered type annototations to the source file @@ -214,15 +228,15 @@ def apply_inferred_types(in_src_f: str, in_src_f_dict: dict, out_src_f_path: str write_file(out_src_f_path, f_parsed.code) -def get_type_preds_single_file(src_f_ext:dict, all_type_slots: Tuple[list], all_type_hints: Tuple[list], - pre_trained_m: PretrainedType4Py, filter_pred_types:bool=True) -> dict: +def get_type_preds_single_file(src_f_ext: dict, all_type_slots: Tuple[list], all_type_hints: Tuple[list], + pre_trained_m: PretrainedType4Py, filter_pred_types: bool = True) -> dict: """ Infers type annotations for the whole source code file """ def filter_preds(preds: List[Tuple[str, float]]) -> List[Tuple[str, float]]: """ - Filters out predictions that are not part of Python builtin types nor part of + Filters out predictions that are not part of Python builtin types nor part of the imported names in the file. 
""" @@ -248,12 +262,16 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]: for te in type_embeds: preds = infer_single_dp(pre_trained_m.type_clusters_idx, pre_trained_m.type4py_model_params['k'], - pre_trained_m.type_clusters_labels, te) + pre_trained_m.type_clusters_labels, te) if filter_pred_types: - type_embeds_preds.append(filter_preds(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds])))) + type_embeds_preds.append(filter_preds(list( + zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p, s in preds])), + [s for p, s in preds])))) else: - type_embeds_preds.append(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds]))) - + type_embeds_preds.append(list( + zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p, s in preds])), + [s for p, s in preds]))) + return type_embeds_preds vars_type_hints, params_type_hints, rets_type_hints = all_type_hints @@ -276,15 +294,14 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]: id_dps += [rets_id_dp] code_tks_dps += [rets_code_tks_dp] vth_dps += [rets_vth_dp] - + if len(id_dps) > 1: id_dps = np.concatenate(tuple(id_dps)) code_tks_dps = np.concatenate(tuple(code_tks_dps)) vth_dps = np.concatenate(tuple(vth_dps)) - else: - if len(id_dps) != 0: - id_dps, code_tks_dps, vth_dps = id_dps[0], code_tks_dps[0], vth_dps[0] - + elif len(id_dps) == 1: + id_dps, code_tks_dps, vth_dps = id_dps[0], code_tks_dps[0], vth_dps[0] + preds = type_embed_single_dp(pre_trained_m.type4py_model, id_dps, code_tks_dps, vth_dps) if pre_trained_m.use_pca: preds = pre_trained_m.type4py_pca.transform(preds) @@ -294,6 +311,7 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]: return src_f_ext + def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: """ It extracts data points from a single file for the model @@ -307,38 +325,36 @@ def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: vars_type_hints = [] params_type_hints = [] rets_type_hints = [] - + # Storing Type4Py's predictions ext_type_hints['variables_p'] = {} for m_v, m_v_o in zip(ext_type_hints['variables'], ext_type_hints['mod_var_occur'].values()): - vars_type_slots.append((ext_type_hints['variables_p'], m_v)) - + vars_type_hints.append([nlp_prep.process_identifier(m_v), - str([nlp_prep.process_sentence(o) for i in m_v_o for o in i]), - AVAILABLE_TYPES_NUMBER-1]) - + str([nlp_prep.process_sentence(o) for i in m_v_o for o in i]), + AVAILABLE_TYPES_NUMBER - 1]) + for i, fn in enumerate(ext_type_hints['funcs']): fn_n = nlp_prep.process_identifier(fn['name']) - fn_p = [(n, nlp_prep.process_identifier(n), o) for n, o in zip(fn['params'], fn["params_occur"].values()) if n not in {'args', 'kwargs'}] + fn_p = [(n, nlp_prep.process_identifier(n), o) for n, o in zip(fn['params'], fn["params_occur"].values()) if + n not in {'args', 'kwargs'}] fn['params_p'] = {'args': [], 'kwargs': []} for o_p, p, p_o in fn_p: - params_type_slots.append((ext_type_hints['funcs'][i]['params_p'], o_p)) params_type_hints.append([fn_n, p, " ".join([p[1] for p in fn_p]), - str([nlp_prep.process_sentence(o) for i in p_o for o in i]), - AVAILABLE_TYPES_NUMBER-1]) + str([nlp_prep.process_sentence(o) for i in p_o for o in i]), + AVAILABLE_TYPES_NUMBER - 1]) # The type of local variables for module-level functions fn['variables_p'] = {} for fn_v, fn_v_o in zip(fn['variables'], fn['fn_var_occur'].values()): - 
vars_type_slots.append((ext_type_hints['funcs'][i]['variables_p'], fn_v)) - + vars_type_hints.append([nlp_prep.process_identifier(fn_v), - str([nlp_prep.process_sentence(o) for i in fn_v_o for o in i]), - AVAILABLE_TYPES_NUMBER-1]) + str([nlp_prep.process_sentence(o) for i in fn_v_o for o in i]), + AVAILABLE_TYPES_NUMBER - 1]) # The return type for module-level functions if ext_type_hints['funcs'][i]['ret_exprs'] != []: @@ -347,26 +363,27 @@ def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: rets_type_slots.append((ext_type_hints['funcs'][i], 'ret_type_p')) - rets_type_hints.append([fn_n, fn_p, " ".join([nlp_prep.process_identifier(r.replace('return ', '')) for r in fn['ret_exprs']]), - AVAILABLE_TYPES_NUMBER-1]) - + rets_type_hints.append( + [fn_n, fn_p, " ".join([nlp_prep.process_identifier(r.replace('return ', '')) for r in fn['ret_exprs']]), + AVAILABLE_TYPES_NUMBER - 1]) + # The type of class-level vars for c_i, c in enumerate(ext_type_hints['classes']): c['variables_p'] = {} for c_v, c_v_o in zip(c['variables'], c['cls_var_occur'].values()): - vars_type_slots.append((ext_type_hints['classes'][c_i]['variables_p'], c_v)) - + vars_type_hints.append([nlp_prep.process_identifier(c_v), str([nlp_prep.process_sentence(o) for i in c_v_o for o in i]), - AVAILABLE_TYPES_NUMBER-1]) - + AVAILABLE_TYPES_NUMBER - 1]) + # The type of arguments for class-level functions # TODO: Ignore triavial funcs such as __str__ for fn_i, fn in enumerate(c['funcs']): fn_n = nlp_prep.process_identifier(fn['name']) - fn_p = [(n, nlp_prep.process_identifier(n), o) for n, o in zip(fn['params'], fn["params_occur"].values()) if n not in {'args', \ - 'kwargs', 'self'}] + fn_p = [(n, nlp_prep.process_identifier(n), o) for n, o in zip(fn['params'], fn["params_occur"].values()) if + n not in {'args', \ + 'kwargs', 'self'}] fn["params_p"] = {'self': [], 'args': [], 'kwargs': []} for o_p, p, p_o in fn_p: @@ -375,8 +392,8 @@ def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: params_type_slots.append((ext_type_hints['classes'][c_i]['funcs'][fn_i]['params_p'], o_p)) params_type_hints.append([fn_n, p, " ".join([p[1] for p in fn_p]), - str([nlp_prep.process_sentence(o) for i in p_o for o in i if o != "self"]), - AVAILABLE_TYPES_NUMBER-1]) + str([nlp_prep.process_sentence(o) for i in p_o for o in i if o != "self"]), + AVAILABLE_TYPES_NUMBER - 1]) # The type of local variables for class-level functions fn['variables_p'] = {} @@ -386,9 +403,9 @@ def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: vars_type_slots.append((ext_type_hints['classes'][c_i]['funcs'][fn_i]['variables_p'], fn_v)) vars_type_hints.append([nlp_prep.process_identifier(fn_v), - str([nlp_prep.process_sentence(o) for i in fn_v_o for o in i]), - AVAILABLE_TYPES_NUMBER-1]) - + str([nlp_prep.process_sentence(o) for i in fn_v_o for o in i]), + AVAILABLE_TYPES_NUMBER - 1]) + # The return type for class-level functions if ext_type_hints['classes'][c_i]['funcs'][fn_i]['ret_exprs'] != []: ext_type_hints['classes'][c_i]['funcs'][fn_i]['ret_type_p'] = {} @@ -396,8 +413,10 @@ def get_dps_single_file(ext_type_hints: dict) -> Tuple[list]: rets_type_slots.append((ext_type_hints['classes'][c_i]['funcs'][fn_i], 'ret_type_p')) rets_type_hints.append([fn_n, fn_p, - " ".join([regex.sub(r"self\.?", '', nlp_prep.process_identifier(r.replace('return ', ''))) for r in fn['ret_exprs']]), - AVAILABLE_TYPES_NUMBER-1]) + " ".join([regex.sub(r"self\.?", '', + nlp_prep.process_identifier(r.replace('return ', ''))) for r + in fn['ret_exprs']]), + 
AVAILABLE_TYPES_NUMBER - 1]) return vars_type_slots + params_type_slots + rets_type_slots, vars_type_hints, params_type_hints, \ rets_type_hints @@ -418,7 +437,7 @@ def type_check_pred(src_f_r: str, src_f_o_path: str, src_f_ext: dict, Type checks a prediction """ apply_inferred_types(src_f_r, src_f_ext, src_f_o_path) - #print(read_file(src_f_o_path)) + # print(read_file(src_f_o_path)) type_checked = type_check_single_file(src_f_o_path, tc) if pred == true: @@ -428,13 +447,13 @@ def type_check_pred(src_f_r: str, src_f_o_path: str, src_f_ext: dict, else: return type_checked, PredictionType.p_not_equal_gt -def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): +def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): mypy_tc = MypyManager('mypy', 20) preds_type_checked: Tuple[bool, PredictionType] = [] for m_v, m_v_t in src_f_ext['variables'].items(): - # The predictions for module-level vars + # The predictions for module-level vars for p, s in src_f_ext['variables_p'][m_v]: logger.info(f"Annotating module-level variable {m_v} with {p}") src_f_ext['variables'][m_v] = p @@ -442,7 +461,7 @@ def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): preds_type_checked.append((is_tc, p_type)) if not is_tc: src_f_ext['variables'][m_v] = m_v_t - + for i, fn in enumerate(src_f_ext['funcs']): for p_n, p_t in fn['params'].items(): # The predictions for arguments for module-level functions @@ -463,7 +482,7 @@ def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): preds_type_checked.append((is_tc, p_type)) if not is_tc: src_f_ext['funcs'][i]['variables'][fn_v] = fn_v_t - + # The return type for module-level functions if src_f_ext['funcs'][i]['ret_exprs'] != []: org_t = src_f_ext['funcs'][i]['ret_type'] @@ -511,16 +530,18 @@ def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): if src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_exprs'] != []: org_t = src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type'] for p, s in src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type_p']: - logger.info(f"Annotating function {src_f_ext['classes'][c_i]['funcs'][fn_i]['name']} return with {p}") + logger.info( + f"Annotating function {src_f_ext['classes'][c_i]['funcs'][fn_i]['name']} return with {p}") src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type'] = p is_tc, p_type = type_check_pred(src_f_read, src_f_o_path, src_f_ext, mypy_tc, p, org_t) preds_type_checked.append((is_tc, p_type)) if not is_tc: src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type'] = org_t - - #apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) + + # apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) return report_type_check_preds(preds_type_checked) + # def get_type_slots_preds_file(source_file_path: str) -> list: # src_f_read = read_file(source_file_path) @@ -530,31 +551,31 @@ def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): # f_type_slots_preds = [] # for m_v, m_v_t in tqdm(src_f_ext['variables'].items()): -# # The predictions for module-level vars +# # The predictions for module-level vars # for p, s in src_f_ext['variables_p'][m_v]: # src_f_ext['variables'][m_v] = p # f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('variables', m_v), m_v_t, p)) - + # for i, fn in tqdm(enumerate(src_f_ext['funcs']), total=len(src_f_ext['funcs']), desc="[module][funcs]"): # for p_n, p_t in fn['params'].items(): # # The predictions for arguments for module-level functions # for p, s in 
fn['params_p'][p_n]: # src_f_ext['funcs'][i]['params'][p_n] = p # f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('funcs', i, 'params', p_n), p_t, p)) - + # # The predictions local variables for module-level functions # for fn_v, fn_v_t in fn['variables'].items(): # for p, s in fn['variables_p'][fn_v]: # src_f_ext['funcs'][i]['variables'][fn_v] = p # f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('funcs', i, 'variables', fn_v), fn_v_t, p)) - + # # The return type for module-level functions # if src_f_ext['funcs'][i]['ret_exprs'] != []: # org_t = src_f_ext['funcs'][i]['ret_type'] # for p, s in src_f_ext['funcs'][i]['ret_type_p']: # src_f_ext['funcs'][i]['ret_type'] = p # f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('funcs', i, 'ret_type'), org_t, p)) - + # # The type of class-level vars # for c_i, c in tqdm(enumerate(src_f_ext['classes']), total=len(src_f_ext['classes']), desc="[module][classes]"): # for c_v, c_v_t in c['variables'].items(): @@ -567,29 +588,28 @@ def type_check_inferred_types(src_f_ext: dict, src_f_read: str, src_f_o_path): # for p_n, p_t in fn["params"].items(): # for p, s in fn["params_p"][p_n]: # src_f_ext['classes'][c_i]['funcs'][fn_i]['params'][p_n] = p -# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'params', p_n), p_t, p)) +# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'params', p_n), p_t, p)) # # The type of local variables for class-level functions # for fn_v, fn_v_t in fn['variables'].items(): # for p, s in fn['variables_p'][fn_v]: # src_f_ext['classes'][c_i]['funcs'][fn_i]['variables'][fn_v] = p -# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'variables', fn_v), fn_v_t, p)) - +# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'variables', fn_v), fn_v_t, p)) + # # The return type for class-level functions # if src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_exprs'] != []: # org_t = src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type'] # for p, s in src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type_p']: # src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type'] = p -# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'ret_type'), org_t, p)) - +# f_type_slots_preds.append((source_file_path, src_f_read, src_f_ext, ('classes', c_i, 'funcs', fn_i, 'ret_type'), org_t, p)) + # #apply_inferred_types(src_f_read, src_f_ext, src_f_o_path) # return f_type_slots_preds def get_type_checked_preds(src_f_ext: dict, src_f_read: str) -> dict: - mypy_tc = MypyManager('mypy', 20) tmp_f = create_tmp_file(".py") - + for m_v, m_v_t in tqdm(src_f_ext['variables'].items()): # The predictions for module-level vars for p, s in src_f_ext['variables_p'][m_v][:]: @@ -597,7 +617,7 @@ def get_type_checked_preds(src_f_ext: dict, src_f_read: str) -> dict: is_tc, _ = type_check_pred(src_f_read, tmp_f.name, src_f_ext, mypy_tc, p, m_v_t) if not is_tc: src_f_ext['variables_p'][m_v].remove((p, s)) - + for i, fn in tqdm(enumerate(src_f_ext['funcs']), total=len(src_f_ext['funcs']), desc="[module][funcs]"): for p_n, p_t in fn['params'].items(): # The predictions for arguments for module-level functions @@ -614,7 +634,7 @@ def get_type_checked_preds(src_f_ext: dict, src_f_read: str) -> dict: is_tc, _ = type_check_pred(src_f_read, tmp_f.name, src_f_ext, mypy_tc, p, fn_v_t) if not 
is_tc: src_f_ext['funcs'][i]['variables_p'][fn_v].remove((p, s)) - + # The return type for module-level functions if src_f_ext['funcs'][i]['ret_exprs'] != []: org_t = src_f_ext['funcs'][i]['ret_type'] @@ -658,14 +678,14 @@ def get_type_checked_preds(src_f_ext: dict, src_f_read: str) -> dict: is_tc, _ = type_check_pred(src_f_read, tmp_f.name, src_f_ext, mypy_tc, p, org_t) if not is_tc: src_f_ext['classes'][c_i]['funcs'][fn_i]['ret_type_p'].remove((p, s)) - + os.unlink(tmp_f.name) return src_f_ext def report_type_check_preds(type_check_preds: List[Tuple[bool, PredictionType]]) -> Tuple[Optional[float], - Optional[float], Optional[float]]: - + Optional[float], Optional[ + float]]: no_p_equal_gt = 0 no_p_equal_gt_tc = 0 no_p_not_equal_gt = 0 @@ -714,7 +734,8 @@ def infer_json_pred(pre_trained_m: PretrainedType4Py, source_file_path: str): src_f_ext = get_type_preds_single_file(src_f_ext, pre_trained_m) save_json(join(dirname(source_file_path), - splitext(basename(source_file_path))[0]+"_type4py_typed.json"), src_f_ext) + splitext(basename(source_file_path))[0] + "_type4py_typed.json"), src_f_ext) + # def type_check_json_pred(pre_trained_m: PretrainedType4Py, source_file_path: str): # pre_trained_m.load_pretrained_model() @@ -727,23 +748,23 @@ def infer_json_pred(pre_trained_m: PretrainedType4Py, source_file_path: str): def type_check_json_pred(source_file_path: str, tc_resuls: list): - src_f_read = read_file(source_file_path) src_f_ext = load_json(join(dirname(source_file_path), - splitext(basename(source_file_path))[0]+"_type4py_typed.json")) - - tc_resuls.append((source_file_path, type_check_inferred_types(src_f_ext, src_f_read, join(dirname(source_file_path), - splitext(basename(source_file_path))[0]+OUTPUT_FILE_SUFFIX)))) + splitext(basename(source_file_path))[0] + "_type4py_typed.json")) + tc_resuls.append((source_file_path, type_check_inferred_types(src_f_ext, src_f_read, join(dirname(source_file_path), + splitext(basename( + source_file_path))[ + 0] + OUTPUT_FILE_SUFFIX)))) -def type_annotate_file(pre_trained_m: PretrainedType4Py, source_code: str, source_file_path: str=None, - filter_pred_types:bool=True): +def type_annotate_file(pre_trained_m: PretrainedType4Py, source_code: str, source_file_path: str = None, + filter_pred_types: bool = True): if source_file_path is not None: src_f_read = read_file(source_file_path) else: src_f_read = source_code - #src_f_ext = analyze_src_f(src_f_read).to_dict() + # src_f_ext = analyze_src_f(src_f_read).to_dict() ext_type_hints = apply_nlp_transf(Extractor.extract(src_f_read, include_seq2seq=False).to_dict()) # logger.info("Extracted JSON-representation of input source file") @@ -762,6 +783,7 @@ def type_annotate_file(pre_trained_m: PretrainedType4Py, source_code: str, sourc return ext_type_hints + def predict_types_src_code(pre_trained_m: PretrainedType4Py, src_code: str) -> dict: src_f_ext = analyze_src_f(src_code).to_dict() logger.info("Extracted type hints and JSON-representation of input source file") @@ -771,8 +793,8 @@ def predict_types_src_code(pre_trained_m: PretrainedType4Py, src_code: str) -> d return src_f_ext + def infer_main(pre_trained_model_path: str, source_file_path: str): - logger.info(f"Inferring types for the file '{basename(source_file_path)}'' using the Type4Py pretrained model") logger.info(f"*************************************************************************") @@ -788,8 +810,8 @@ def infer_main(pre_trained_model_path: str, source_file_path: str): # src_f_ext = type_annotate_file(pre_trained_m, source_file_path) # 
save_json(join(pre_trained_model_path, splitext(basename(source_file_path))[0]+"_typed.json"), src_f_ext)

-if __name__ == '__main__':
+if __name__ == '__main__':
     arg_parser = ArgumentParser(description="Infering type annotations for a Python file")
     arg_parser.add_argument("--m", required=True, type=str, help="Path to the pre-trained Type4Py model")
     arg_parser.add_argument("--f", required=True, type=str, help="Path to a source code file")
diff --git a/type4py/deploy/infer_project.py b/type4py/deploy/infer_project.py
index 7ea9d23..0698031 100644
--- a/type4py/deploy/infer_project.py
+++ b/type4py/deploy/infer_project.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pandas as pd
 import tqdm
+import traceback
 
 from type4py.deploy.infer import PretrainedType4Py, type_annotate_file
 from type4py.deploy.utils.extract_types import extract_result_ml
@@ -63,14 +64,15 @@ def ml_infer(repo, model, project_dir):
                 project_analyzed_files[project_id]["src_files"][filename] = \
                     ext_type_hints
             except ParseError as err:
-                # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
                 pass
+                # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
             except UnicodeDecodeError:
-                # print(f"Could not read file {filename}")
                 pass
+                # print(f"Could not read file {filename}")
             except Exception as err:
-                # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
                 pass
+                # traceback.print_exc()
+                # print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
 
         if len(project_analyzed_files[project_id]["src_files"].keys()) != 0:
             project_analyzed_files[project_id]["type_annot_cove"] = \
diff --git a/type4py/deploy/utils/__init__.py b/type4py/deploy/utils/__init__.py
new file mode 100644
index 0000000..e69de29

From 3e9448102d76ecb2ded1040da2ada69c4619af87 Mon Sep 17 00:00:00 2001
From: fenglang
Date: Wed, 6 Sep 2023 14:40:34 +0200
Subject: [PATCH 43/43] update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 96e3e93..2ff3b40 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,8 @@ pip install .
 Follow the below steps to train and evaluate the Type4Py model.
 ## 1. Extraction
 **NOTE:** Skip this step if you're using the ManyTypes4Py dataset.
+
+**NOTE:** You can find a newer ManyTypes4Py dataset (MTV0.8) on [Zenodo](https://zenodo.org/record/8321283).
 ```
 $ type4py extract --c $DATA_PATH --o $OUTPUT_DIR --d $DUP_FILES --w $CORES
 ```
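
For readers who want to drive the inference code touched by these patches directly, the snippet below is a minimal sketch of the programmatic route, based on the `PretrainedType4Py` and `type_annotate_file` signatures shown in `type4py/deploy/infer.py` above. The model directory and input file are placeholder paths, and running on CPU is an assumption; this is an illustration, not part of the patches.

```
from type4py.deploy.infer import PretrainedType4Py, type_annotate_file

# Placeholder paths, for illustration only.
MODEL_DIR = "/path/to/pretrained/type4py"
SOURCE_FILE = "example.py"

# Load the ONNX model, Word2Vec embeddings, type clusters and label encoder once.
t4py = PretrainedType4Py(MODEL_DIR, device='cpu')
t4py.load_pretrained_model()

# Predict types for a single file; the result is the JSON-like dict whose
# 'variables_p', 'params_p' and 'ret_type_p' fields hold ranked (type, score) predictions.
preds = type_annotate_file(t4py, source_code=None, source_file_path=SOURCE_FILE)
```

The equivalent command-line call, per the argument parser shown at the bottom of `infer.py`, should be roughly `python type4py/deploy/infer.py --m <model-dir> --f <file.py>`.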