Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
dbd307f
fix the issues in preprocess and np.stack in batches in vec
LangFeng0912 Mar 22, 2023
896e819
add the learn_sep into the type4py pipeline
LangFeng0912 Mar 22, 2023
b38ba1e
add gen_cluster and update reduce for batches script
LangFeng0912 Mar 23, 2023
bfb9732
add infer_project CLI command for infer the dataset
LangFeng0912 Mar 24, 2023
4cc376f
add more comments
LangFeng0912 Mar 28, 2023
084aaa2
fix the issues
LangFeng0912 Apr 6, 2023
298c962
fix the issues, add the predicts script and exceptions script
LangFeng0912 Apr 10, 2023
0408072
update infer-project .py
LangFeng0912 May 29, 2023
e336713
add project-base inference pipeline
LangFeng0912 May 29, 2023
08a9f51
add project-base inference for ml & hybrid
LangFeng0912 May 29, 2023
bd4bb81
add script explanations
LangFeng0912 May 29, 2023
51a9693
update infer-project base approach name t4pyre and t4pyright
LangFeng0912 Jun 5, 2023
5c5f522
update t4pyright logic in infer-project approach
LangFeng0912 Jun 6, 2023
8698994
rename type_preprocess script
LangFeng0912 Jun 8, 2023
e9b1a11
update TypeAnnotationFinder & Masker to libsa4py and import from it
LangFeng0912 Jun 8, 2023
0b491e2
update comments
LangFeng0912 Jun 8, 2023
afbddd7
update vectorize
LangFeng0912 Aug 17, 2023
2e80ff1
update preprocess
LangFeng0912 Aug 17, 2023
f7d1fef
update learn_split.py
LangFeng0912 Aug 17, 2023
ed8af0d
update learn_split.py
LangFeng0912 Aug 17, 2023
2ff621e
update learn_split.py
LangFeng0912 Aug 17, 2023
1a5332a
update learn_split.py
LangFeng0912 Aug 17, 2023
40f1302
update learn_split.py
LangFeng0912 Aug 17, 2023
9aa640b
update pipeline
LangFeng0912 Aug 18, 2023
e01df3c
update pipeline
LangFeng0912 Aug 18, 2023
3eb4fe9
update pipeline
LangFeng0912 Aug 18, 2023
33d169f
update Dockerfile for cuda version
LangFeng0912 Aug 18, 2023
8e0f0de
update model parameters
LangFeng0912 Aug 18, 2023
3dc74c5
update model parameters
LangFeng0912 Aug 23, 2023
97c1c11
update model parameters
LangFeng0912 Aug 23, 2023
67b8a70
update infer main approach
LangFeng0912 Aug 23, 2023
b369011
update infer main approach
LangFeng0912 Aug 23, 2023
acbdae1
update infer main approach
LangFeng0912 Aug 23, 2023
d612597
update infer main approach
LangFeng0912 Aug 23, 2023
704af65
update infer main approach
LangFeng0912 Aug 23, 2023
6fab977
update type preprocess_list
LangFeng0912 Aug 24, 2023
ace9baf
update type preprocess_list
LangFeng0912 Aug 24, 2023
1750b74
update eval scripts
LangFeng0912 Aug 24, 2023
46b3ea5
update eval scripts
LangFeng0912 Aug 24, 2023
9592acf
update infer project scripts
LangFeng0912 Aug 24, 2023
40880e1
update infer project scripts
LangFeng0912 Aug 24, 2023
502a1ca
update infer project scripts
LangFeng0912 Aug 24, 2023
3e94481
update README.md
LangFeng0912 Sep 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,222 changes: 2,222 additions & 0 deletions dataset_split_repo.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions type4py/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
MAX_PARAM_TYPE_DEPTH = 2
TOKEN_SEQ_LEN = (7, 3)
AVAILABLE_TYPE_APPLY_PROB = 0.5
IDENTIFIER_SEQ_LEN = 31
168 changes: 135 additions & 33 deletions type4py/__main__.py

Large diffs are not rendered by default.

313 changes: 225 additions & 88 deletions type4py/data_loaders.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions type4py/deploy/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,9 @@ def infer_preds_score(type_embeds: np.array) -> List[List[Tuple[str, float]]]:
preds = infer_single_dp(pre_trained_m.type_clusters_idx, pre_trained_m.type4py_model_params['k'],
pre_trained_m.type_clusters_labels, te)
if filter_pred_types:
type_embeds_preds.append(filter_preds(list(zip(list(pre_trained_m.label_enc.inverse_transform([p for p,s in preds])), [s for p,s in preds]))))
type_embeds_preds.append(filter_preds(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds]))))
else:
type_embeds_preds.append(list(zip(list(pre_trained_m.label_enc.inverse_transform([p for p,s in preds])), [s for p,s in preds])))
type_embeds_preds.append(list(zip(list(pre_trained_m.label_enc.inverse_transform([int(p) for p,s in preds])), [s for p,s in preds])))

return type_embeds_preds

Expand Down
129 changes: 129 additions & 0 deletions type4py/deploy/infer_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
This module infers types for whole projects and writes the results to JSON files, based on three approaches:
type4py, pyre and pyright
"""
import os
from typing import List
import pandas as pd
import tqdm

from type4py.deploy.infer import PretrainedType4Py, type_annotate_file
from type4py import logger
from libsa4py.exceptions import ParseError

from libsa4py.utils import list_files, find_repos_list, save_json
from pathlib import Path
import multiprocessing
from type4py.deploy.static_infer import pyre_infer
from type4py.deploy.utils.pyre_merge import merge_pyre


# One inter-process result channel per inference back end.
# ml_queue is fed by run_mlInfer and pyre_queue by run_pyreInfer;
# pyright_queue is created here but not used by the code in this module.
ml_queue, pyre_queue, pyright_queue = (multiprocessing.Queue() for _ in range(3))

def find_test_list(project_dir, dataset_split):
    """Return the test-set repositories that are present under ``project_dir``.

    ``dataset_split`` is a CSV file with (at least) the columns ``set`` and
    ``project``. Rows whose ``set`` equals ``'test'`` are selected, and the
    second and third ``/``-separated components of ``project`` are taken as
    the author and the repository name. A repository is reported only if
    ``<project_dir>/<author>/<repo>`` is an existing directory.

    Returns:
        A list of ``{"author": ..., "repo": ...}`` dicts, in CSV row order.

    Raises:
        FileNotFoundError: if ``dataset_split`` does not exist.
    """
    if not os.path.exists(dataset_split):
        raise FileNotFoundError(f"dataset_split file: {dataset_split} does not exist!")

    split_table = pd.read_csv(dataset_split)
    test_rows = split_table[split_table['set'] == 'test']

    repos_list: List[dict] = []
    for _, entry in test_rows.iterrows():
        parts = entry['project'].split('/')
        author = parts[1]
        repo = parts[2]
        # Only keep repositories that actually exist on disk.
        if os.path.isdir(os.path.join(project_dir, author, repo)):
            repos_list.append({"author": author, "repo": repo})
    return repos_list

def ml_infer(repo, model, project_dir):
    """Run ``type_annotate_file`` on every file of one project and collect the results.

    Args:
        repo: dict with the keys ``"author"`` and ``"repo"``.
        model: the pre-trained model object handed to ``type_annotate_file``.
        project_dir: root directory; the project is read from
            ``<project_dir>/<author>/<repo>``.

    Returns:
        ``{"<author>/<repo>": {"src_files": {...}, "type_annot_cove": float}}``
        where ``src_files`` maps each file path returned by ``list_files`` to the
        result of ``type_annotate_file`` and ``type_annot_cove`` is the mean of the
        per-file ``"type_annot_cove"`` values rounded to two decimals (``0.0``
        when no file was processed successfully).

    Files that raise during annotation are reported on stdout and skipped.
    """
    author, name = repo["author"], repo["repo"]
    project_path = os.path.join(project_dir, author, name)
    project_id = "/".join((author, name))

    # `src_files` is the very dict stored in the result, so filling it fills the result.
    src_files: dict = {}
    project_analyzed_files: dict = {project_id: {"src_files": src_files, "type_annot_cove": 0.0}}
    print(f'Running pipeline for project {project_path}')

    print(f'Extracting for {project_path}...')
    found = list_files(project_path)
    print(f"{project_path} has {len(found)} files")

    parent_dir = Path(project_path).parent
    file_pairs = [(path, str(Path(path).relative_to(parent_dir))) for path in found]

    for filename, _relative in file_pairs:
        try:
            src_files[filename] = type_annotate_file(model, None, filename)
        except ParseError as err:
            print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
        except UnicodeDecodeError:
            print(f"Could not read file {filename}")
        except Exception as err:
            print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))

    if len(src_files) != 0:
        coverages = [file_info["type_annot_cove"] for file_info in src_files.values()]
        project_analyzed_files[project_id]["type_annot_cove"] = round(sum(coverages) / len(coverages), 2)

    return project_analyzed_files


def run_mlInfer(repo, model, project_dir):
    """Process target: run ``ml_infer`` for one repo and put the result on ``ml_queue``.

    The inputs are explicit parameters because no module-level ``repo``,
    ``model`` or ``project_dir`` exists for the worker to read.
    """
    ml_result = ml_infer(repo, model, project_dir)
    ml_queue.put(ml_result)


def run_pyreInfer(repo, project_dir):
    """Process target: run ``pyre_infer`` for one repo and put the result on ``pyre_queue``."""
    pyre_result = pyre_infer(repo, project_dir)
    pyre_queue.put(pyre_result)


def infer_projects(model, project_dir, tar_dir, approach, split_file):
    """Infer types for a set of projects and save one JSON file per project in ``tar_dir``.

    Args:
        model: pre-trained model object forwarded to ``ml_infer``.
        project_dir: root directory containing ``<author>/<repo>`` projects.
        tar_dir: directory the result JSON files are written to.
        approach: ``"t4py"`` runs ``ml_infer`` only; ``"hybrid0"`` runs ``ml_infer``
            and ``pyre_infer`` in two processes and merges them with ``merge_pyre``.
            Any other value selects the projects but infers nothing.
        split_file: dataset-split CSV used to select the test projects, or
            ``None`` to process every project found by ``find_repos_list``.

    Raises:
        FileNotFoundError: propagated from ``find_test_list`` when ``split_file``
            is given but does not exist.
    """
    # Imported here under an alias so the progress bar works regardless of
    # whether the file-level import binds the `tqdm` module or the `tqdm` class.
    from tqdm import tqdm as progress_bar

    if split_file is not None:
        repo_infos_test = find_test_list(project_dir, split_file)
        logger.info(f'Totally find {len(repo_infos_test)} projects in test set')
    else:
        logger.info(f"dataset_split file not provided, infer all projects in {project_dir}")
        repo_infos_test = find_repos_list(project_dir)
        logger.info(f'Totally find {len(repo_infos_test)} projects in project dir')

    if approach == "t4py":
        for repo in progress_bar(repo_infos_test):
            project_name = "".join((repo["author"], repo["repo"]))
            filepath = os.path.join(tar_dir, f"{project_name}_mlInfer.json")
            # ml_infer takes (repo, model, project_dir); saving is done here.
            processed_file = ml_infer(repo, model, project_dir)
            save_json(filepath, processed_file)

    if approach == "hybrid0":
        for repo in progress_bar(repo_infos_test):
            # NOTE(review): with the "spawn" start method `model` must be picklable;
            # with "fork" it is inherited by the child - confirm for the target platform.
            process1 = multiprocessing.Process(target=run_mlInfer, args=(repo, model, project_dir))
            process2 = multiprocessing.Process(target=run_pyreInfer, args=(repo, project_dir))

            # Start the processes
            process1.start()
            process2.start()

            # Get the results from t4py and pyre & merge
            # NOTE(review): these calls block forever if a worker dies before
            # putting its result on the queue.
            ml_result = ml_queue.get()
            sa_result = pyre_queue.get()

            # Drain the queues *before* joining (a child cannot exit while its
            # queued data is unread), then reap both children.
            process1.join()
            process2.join()

            project_id = "/".join((repo["author"], repo["repo"]))
            project_name = "".join((repo["author"], repo["repo"]))
            hy_result = merge_pyre(ml_result, sa_result, project_id)

            filepath = os.path.join(tar_dir, f"{project_name}_hybridinfer0.json")
            save_json(filepath, hy_result)

def infer_project_main(model_path, input_path, output_path, approach, split_file):
    """Load the pre-trained Type4Py model and run project-level inference.

    Builds a ``PretrainedType4Py`` from ``model_path`` (device string ``"gpu"``,
    ``pre_read_type_cluster=False``, ``use_pca=True``), loads it, and hands it
    to ``infer_projects`` together with the input directory, output directory,
    approach name and optional dataset-split file.
    """
    pretrained = PretrainedType4Py(model_path, "gpu", pre_read_type_cluster=False, use_pca=True)
    pretrained.load_pretrained_model()
    infer_projects(
        model=pretrained,
        project_dir=input_path,
        tar_dir=output_path,
        approach=approach,
        split_file=split_file,
    )

75 changes: 75 additions & 0 deletions type4py/deploy/static_infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
This module performs static type inference, including Pyre inference and Pyright inference
"""

import os
from pathlib import Path
import utils.pyre_utils as pyre_util
from utils.utils import rebuild_repo
from libsa4py.utils import list_files, read_file
from libsa4py.exceptions import ParseError
from libsa4py.cst_extractor import Extractor
import shutil


def pyre_start(project_path):
    """Prepare ``project_path`` for Pyre by running the ``pyre_util`` set-up steps in order.

    The steps are: clean the watchman config, clean the pyre config,
    start watchman, start pyre.
    """
    setup_steps = (
        pyre_util.clean_watchman_config,
        pyre_util.clean_pyre_config,
        pyre_util.start_watchman,
        pyre_util.start_pyre,
    )
    for step in setup_steps:
        step(project_path)


def pyre_infer(repo, project_dir):
    """Run Pyre inference on one project and extract the per-file results.

    The project is first rebuilt into a scratch directory with ``rebuild_repo``
    (per the original comment, to mask the original types). Pyre is then
    started on the copy, ``pyre_util.pyre_infer`` is run, and every file is
    queried and passed through ``Extractor.extract``.

    Args:
        repo: dict with the keys ``"author"`` and ``"repo"``.
        project_dir: root directory containing the original ``<author>/<repo>`` project.

    Returns:
        ``{"<author>/<repo>": {"src_files": {...}, "type_annot_cove": float}}``
        where ``src_files`` is keyed by the file paths returned by ``list_files``
        and ``type_annot_cove`` is the mean per-file ``"type_annot_cove"`` rounded
        to two decimals (``0.0`` when no file was processed successfully).

    A file that fails to parse or read is reported on stdout and skipped; the
    remaining files are still processed. The Pyre/watchman servers are shut
    down and the scratch directory is removed even if inference raises.
    """
    # rebuild for masking original types
    cache_path = "/cache_path"
    # exist_ok: a directory left behind by an interrupted run must not abort this one.
    os.makedirs(cache_path, exist_ok=True)
    try:
        rebuild_repo(project_dir, cache_path, repo)

        project_author = repo["author"]
        project_name = repo["repo"]
        project_path = os.path.join(cache_path, project_author, project_name)
        project_id = "/".join((project_author, project_name))
        # `src_files` is the very dict stored in the result, so filling it fills the result.
        src_files: dict = {}
        project_analyzed_files: dict = {project_id: {"src_files": src_files, "type_annot_cove": 0.0}}

        print(f'Running pyre pipeline for project {project_path}')
        pyre_start(project_path)
        try:
            # start pyre infer for project
            print(f'Running pyre infer for project {project_path}')
            pyre_util.pyre_infer(project_path)
            print(f'Extracting for {project_path}...')
            project_files = list_files(project_path)
            print(f"{project_path} has {len(project_files)} files")

            if project_files:
                print(f'Running pyre query for project {project_path}')

            for filename in project_files:
                # Per-file try: one bad file must not discard the rest of the project.
                try:
                    pyre_data_file = pyre_util.pyre_query_types(project_path, filename)
                    src_files[filename] = \
                        Extractor.extract(read_file(filename), pyre_data_file).to_dict()
                except ParseError as err:
                    print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))
                except UnicodeDecodeError:
                    print(f"Could not read file {filename}")
                except Exception as err:
                    print("project: %s |file: %s |Exception: %s" % (project_id, filename, err))

            print(f'Saving static analysis results for {project_id}...')

            if src_files:
                coverages = [file_info["type_annot_cove"] for file_info in src_files.values()]
                project_analyzed_files[project_id]["type_annot_cove"] = \
                    round(sum(coverages) / len(coverages), 2)
        finally:
            # Always stop the servers started by pyre_start and drop their config.
            pyre_util.watchman_shutdown(project_path)
            pyre_util.pyre_server_shutdown(project_path)
            pyre_util.clean_config(project_path)
    finally:
        # remove cache projects
        shutil.rmtree(cache_path)

    return project_analyzed_files
Loading