From 102c6470daebb67bf44f5103d7fc61737ff620a0 Mon Sep 17 00:00:00 2001 From: Stanford997 <545255309@qq.com> Date: Thu, 21 Nov 2024 15:06:26 -0500 Subject: [PATCH 1/9] feat: update all user state to 0 when app.py starts --- be_repo/app.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/be_repo/app.py b/be_repo/app.py index 2e581391c..aa1c4a5c2 100644 --- a/be_repo/app.py +++ b/be_repo/app.py @@ -38,6 +38,11 @@ user_state_collection = user_database.get_collection("user_chat_state") query = {"user_id": "test"} user_state = user_state_collection.find_one(query) + # update all user state to 0 + update_result = user_state_collection.update_many( + {}, + {"$set": {"state": 0}} + ) except Exception as e: raise Exception("Unable to find the document due to the following error: ", e) @@ -160,6 +165,7 @@ def ask_question(): return jsonify({"response": response}), 200 + @app.route('/suggest/interiew_question', methods=['POST', 'OPTIONS']) def interview_question_suggestion(): if request.method == 'OPTIONS': From 216ba0f0f35c2ce07d1a0829550c050c039cc00b Mon Sep 17 00:00:00 2001 From: zihan zhou <95243748+andyasdd1@users.noreply.github.com> Date: Sat, 30 Nov 2024 23:11:34 -0500 Subject: [PATCH 2/9] Add New Modules: GraphCypherQAChain. Add New Preprocess modules: embedding and nodes retrival. Added New Config: OpenAi key retrieve embedding.py graph-preprocess-simpl.py neo4j_import.py openai_key.py job_recommendation_system.py neo4j_model.py recommendation_generator.py resume_processor.py retrieval_engine.py verify.py view.py --- be_repo/configs/openai_key.py | 27 + be_repo/modules/job_recommendation_system.py | 75 ++ be_repo/modules/neo4j_model.py | 136 ++++ be_repo/modules/recommendation_generator.py | 45 ++ be_repo/modules/resume_processor.py | 25 + be_repo/modules/retrieval_engine.py | 109 +++ be_repo/modules/verify.py | 12 + be_repo/modules/view.py | 35 + be_repo/preprocess/embedding.py | 104 +++ be_repo/preprocess/graph-preprocess-simpl.py | 749 +++++++++++++++++++ be_repo/preprocess/neo4j_import.py | 233 ++++++ 11 files changed, 1550 insertions(+) create mode 100644 be_repo/configs/openai_key.py create mode 100644 be_repo/modules/job_recommendation_system.py create mode 100644 be_repo/modules/neo4j_model.py create mode 100644 be_repo/modules/recommendation_generator.py create mode 100644 be_repo/modules/resume_processor.py create mode 100644 be_repo/modules/retrieval_engine.py create mode 100644 be_repo/modules/verify.py create mode 100644 be_repo/modules/view.py create mode 100644 be_repo/preprocess/embedding.py create mode 100644 be_repo/preprocess/graph-preprocess-simpl.py create mode 100644 be_repo/preprocess/neo4j_import.py diff --git a/be_repo/configs/openai_key.py b/be_repo/configs/openai_key.py new file mode 100644 index 000000000..cf045505d --- /dev/null +++ b/be_repo/configs/openai_key.py @@ -0,0 +1,27 @@ +# configs/openai_key.py + +from configs.database import get_key_database + +def get_openai_api_key(): + """ + Retrieve the OpenAI API key from the MongoDB database. + + Returns: + str: The OpenAI API key. + + Raises: + ValueError: If the API key is not found or is empty. 
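+
+    Example (illustrative; assumes the 'keys' collection stores a document
+    shaped like {"_id": "chatgpt_api", "api_key": "sk-..."}):
+
+        api_key = get_openai_api_key()
+        assert api_key  # a non-empty key string for the OpenAI client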
+ """ + db = get_key_database() + keys_collection = db["keys"] + openai_key_doc = keys_collection.find_one({"_id": "chatgpt_api"}) + + if not openai_key_doc: + raise ValueError("OpenAI API key not found in the database.") + + openai_key = openai_key_doc.get("api_key") + + if not openai_key: + raise ValueError("OpenAI API key is empty.") + + return openai_key diff --git a/be_repo/modules/job_recommendation_system.py b/be_repo/modules/job_recommendation_system.py new file mode 100644 index 000000000..95e4b609b --- /dev/null +++ b/be_repo/modules/job_recommendation_system.py @@ -0,0 +1,75 @@ +# job_recommendation_system.py + +from neo4j_model import Neo4jModel +from resume_processor import ResumeProcessor +from retrieval_engine import RetrievalEngine +from recommendation_generator import RecommendationGenerator +from view import CLIView +import sys + +def main(): + + + # Redirect standard output to a file + sys.stdout = open('output.log', 'w') + + # Your code here + print("Lots of output") + + + # Setup Logging + import logging + logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + logger = logging.getLogger(__name__) + + # Neo4j Connection Details + NEO4J_URI = "neo4j+ssc://7bf5a48e.databases.neo4j.io" # Replace with your Neo4j URI + NEO4J_USERNAME = "neo4j" # Replace with your Neo4j username + NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Replace with your Neo4j password + + # Initialize Model + neo4j_model = Neo4jModel( + uri=NEO4J_URI, + username=NEO4J_USERNAME, + password=NEO4J_PASSWORD + ) + + # Initialize Controller Components + resume_processor = ResumeProcessor() + retrieval_engine = RetrievalEngine(resume_processor, neo4j_model) + recommendation_generator = RecommendationGenerator() + + # Initialize View + view = CLIView() + + # Get Resume Input from User + resume_text = view.get_resume_input() + + if not resume_text.strip(): + logger.error("No resume text provided.") + print("Error: No resume text provided.") + return + + # Perform Mixed Retrieval for 'JD' Node Label + node_label = "JD" # Adjust as needed; could be dynamic based on user input or other criteria + similar_docs, graph_results = retrieval_engine.perform_mixed_retrieval(resume_text, node_label=node_label) + + if not similar_docs and not graph_results: + print("No job recommendations found based on your resume.") + return + + # Generate Recommendations + try: + recommendations = recommendation_generator.generate_recommendations(similar_docs, graph_results) + except Exception as e: + print("Error: Failed to generate job recommendations.") + return + + # Display Recommendations + view.display_recommendations(recommendations) + + # Close the file + sys.stdout.close() + +if __name__ == "__main__": + main() diff --git a/be_repo/modules/neo4j_model.py b/be_repo/modules/neo4j_model.py new file mode 100644 index 000000000..f3a987c95 --- /dev/null +++ b/be_repo/modules/neo4j_model.py @@ -0,0 +1,136 @@ +# neo4j_model.py +from langchain_neo4j import GraphCypherQAChain, Neo4jGraph +from langchain_community.vectorstores import Neo4jVector +from langchain_openai import OpenAIEmbeddings, ChatOpenAI +from configs.openai_key import get_openai_api_key # New import +from langchain.prompts import PromptTemplate + +custom_cypher_prompt = PromptTemplate( + input_variables=["question", "schema"], + template=""" + You are an expert Cypher query writer for a Neo4j graph database. 
+ + The database has the following schema: + + {schema} + + Given the user's question, generate an efficient Cypher query that: + + - Retrieves relevant job recommendations based on the user's resume. + - Excludes the 'embedding' property to avoid exceeding context limits. + - Limits the number of results to avoid duplicates and improve performance. + - Returns relevant job recommendations based on the user's resume. + + Question: + {question} + + Cypher Query: + """ +) + + +class Neo4jModel: + def __init__(self, uri, username, password): + + # Initialize Neo4j Graph connection + self.graph = Neo4jGraph( + url=uri, + username=username, + password=password, + # enhanced_schema=True, # Optional: Provides more detailed schema information + ) + + # Initialize the embedding model with the API key + api_key = get_openai_api_key() + self.embeddings = OpenAIEmbeddings(openai_api_key=api_key) + + # Initialize Neo4jVector for each node label + self.vector_store_jd = Neo4jVector.from_existing_index( + embedding=self.embeddings, + url=uri, + username=username, + password=password, + index_name="jd_embedding_index", + ) + + + self.vector_store_jtitle = Neo4jVector.from_existing_index( + embedding=self.embeddings, + url=uri, + username=username, + password=password, + index_name="jtitle_embedding_index", + ) + + + self.vector_store_jkeyword = Neo4jVector.from_existing_index( + embedding=self.embeddings, + url=uri, + username=username, + password=password, + index_name="jkeyword_embedding_index", + ) + + # Initialize Language Model for QA Chain + self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=api_key) + + # Initialize GraphCypherQAChain + self.graph_chain = GraphCypherQAChain.from_llm( + graph=self.graph, + llm=self.llm, + cypher_prompt=custom_cypher_prompt, + return_intermediate_steps=True, + verbose=True, + validate_cypher=True, # Ensures correct relationship directions + allow_dangerous_requests=True, + ) + + def store_documents(self, docs, node_label="JD"): + """ + Store documents in Neo4jVector with embeddings. + """ + # Ensure that 'docs' is a list of Document objects + if node_label == "JD": + self.vector_store_jd.add_documents(docs) + + elif node_label == "JTitle": + self.vector_store_jtitle.add_documents(docs) + + elif node_label == "JKeyword": + self.vector_store_jkeyword.add_documents(docs) + + else: + + raise ValueError(f"Invalid node_label '{node_label}'. Must be 'JD', 'JTitle', or 'JKeyword'.") + + def query_graph(self, cypher_query, parameters=None): + """ + Execute a Cypher query against the Neo4j graph. + """ + results = self.graph.query(cypher_query, parameters) + return results + + def get_retriever(self, node_label="JD"): + """ + Get a retriever from the Neo4jVector for vector similarity searches. + """ + try: + if node_label == "JD": + return self.vector_store_jd.as_retriever() + elif node_label == "JTitle": + return self.vector_store_jtitle.as_retriever() + elif node_label == "JKeyword": + return self.vector_store_jkeyword.as_retriever() + else: + raise ValueError(f"Invalid node_label '{node_label}'. Must be 'JD', 'JTitle', or 'JKeyword'.") + except Exception as e: + raise e + + def get_graph_chain(self): + """ + Get the GraphCypherQAChain instance. + + Returns: + GraphCypherQAChain: The QA chain instance. 
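
        Example (a sketch of intended usage, mirroring perform_mixed_retrieval):

            chain = neo4j_model.get_graph_chain()
            response = chain.invoke({"query": "Recommend jobs for a Python developer"})
            print(response["result"])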
+ """ + return self.graph_chain \ No newline at end of file diff --git a/be_repo/modules/recommendation_generator.py b/be_repo/modules/recommendation_generator.py new file mode 100644 index 000000000..767063c7d --- /dev/null +++ b/be_repo/modules/recommendation_generator.py @@ -0,0 +1,45 @@ +# recommendation_generator.py + +class RecommendationGenerator: + def __init__(self): + pass + + def merge_results(self, vector_docs, graph_results): + combined_jobs = {} + + # Process vector similarity results + for doc in vector_docs: + comp = doc.metadata.get("comp", "") + resp = doc.metadata.get("resp", "") + job_title = f"{resp} at {comp}".strip() + if job_title: + combined_jobs[job_title] = combined_jobs.get(job_title, 0) + 1 + + # Process graph traversal results + # Access the context from intermediate steps + intermediate_steps = graph_results.get('intermediate_steps', []) + if len(intermediate_steps) > 1: + context = intermediate_steps[1].get('context', []) + for job in context: + job_title = job.get('job_title', '') + company = job.get('company', '') + if job_title and company: + combined_job = f"{job_title} at {company}" + combined_jobs[combined_job] = combined_jobs.get(combined_job, 0) + 1 + + # Convert to sorted list based on combined score + sorted_jobs = sorted(combined_jobs.items(), key=lambda item: item[1], reverse=True) + return [job for job, score in sorted_jobs] + + def generate_recommendations(self, vector_docs, graph_results): + """ + Generate a ranked list of job recommendations by merging vector and graph results. + + Parameters: + vector_docs (List[Document]): Documents from vector similarity search. + graph_results (dict): Results from graph traversal. + + Returns: + List[str]: Ranked list of unique job recommendations. + """ + return self.merge_results(vector_docs, graph_results) diff --git a/be_repo/modules/resume_processor.py b/be_repo/modules/resume_processor.py new file mode 100644 index 000000000..8b6dddfb8 --- /dev/null +++ b/be_repo/modules/resume_processor.py @@ -0,0 +1,25 @@ +# resume_processor.py + +from langchain.docstore.document import Document + +class ResumeProcessor: + def __init__(self): + """ + Initialize the Resume Processor. + """ + + def process_resume(self, resume_text): + """ + Process the user's resume to create a Document object. + + Parameters: + resume_text (str): The user's resume text. + + Returns: + Document or None: The processed resume as a LangChain Document or None if failed. + """ + try: + doc = Document(page_content=resume_text, metadata={}) + return doc + except Exception as e: + return None diff --git a/be_repo/modules/retrieval_engine.py b/be_repo/modules/retrieval_engine.py new file mode 100644 index 000000000..02fac70e3 --- /dev/null +++ b/be_repo/modules/retrieval_engine.py @@ -0,0 +1,109 @@ +# retrieval_engine.py + +from langchain_neo4j import GraphCypherQAChain +from langchain_openai import ChatOpenAI +from langchain.chains.retrieval import create_retrieval_chain +from langchain.chains.combine_documents import create_stuff_documents_chain +from configs.openai_key import get_openai_api_key # New import +from langchain.prompts import PromptTemplate + +class RetrievalEngine: + def __init__(self, resume_processor, neo4j_model): + """ + Initialize the Retrieval Engine with necessary components. + + Parameters: + resume_processor (ResumeProcessor): Instance to process resumes. + neo4j_model (Neo4jModel): Instance to interact with Neo4j. 
+ """ + self.resume_processor = resume_processor + self.neo4j_model = neo4j_model + + # Initialize Language Model (already initialized in Neo4jModel) + self.llm = self.neo4j_model.llm + + # Initialize GraphCypherQAChain (already initialized in Neo4jModel) + self.graph_chain = self.neo4j_model.get_graph_chain() + + # Define the PromptTemplate with 'context' as input variable + prompt = PromptTemplate( + template=""" + You are an expert Cypher query writer for a Neo4j graph database. + + Given the user's question, generate an efficient Cypher query that: + - extract entities and relationships from the following resume. + - Focus solely on the resume content. + + **Entities to Extract:** + - **Education (Edu):** Details about degrees, fields of study, institutions, start and end years, GPA. + - **Work Experience (WE):** Positions held, companies, locations. + - **Projects (Proj):** Project titles, descriptions, technologies used, roles. + - **Skills (Skill):** Technical and soft skills. + - **Certifications (Cert):** Certification names, issuing organizations, expiration dates. + - **Soft Skills (SSkill):** Non-technical skills like leadership, communication. + + **Relationships to Identify:** + - **UTILIZES_SKILL:** A Work Experience (WE) node utilizes a Skill (Skill) node. + - **USES_TECH:** A Project (Proj) node uses a Skill (Skill) node as a technology. + - **REL_TO (Proj to Skill):** A Project (Proj) node is related to a Skill (Skill) node. + - **REL_TO (Skill to Skill):** A Skill (Skill) node is similar to another Skill (Skill) node. + + **Resume:** + \"\"\" + {context} + \"\"\" + """, + input_variables=["input"] + ) + + # Create a documents chain + self.combine_docs_chain = create_stuff_documents_chain(self.llm, prompt=prompt) + + # Initialize Retrieval Chain + # Default node_label is 'JD'; can be adjusted as needed + self.retrieval_chain = create_retrieval_chain( + self.neo4j_model.get_retriever(node_label="JD"), + self.combine_docs_chain + ) + + def perform_mixed_retrieval(self, resume_text, node_label="JD"): + """ + Perform mixed retrieval using vector similarity and graph traversal. + + Parameters: + resume_text (str): The user's resume text. + node_label (str): The node label to perform retrieval on ('JD', 'JTitle', 'JKeyword'). + + Returns: + Tuple[List[Document], dict]: Results from vector similarity and graph traversal. 
+ """ + # Process resume into a Document + doc = self.resume_processor.process_resume(resume_text) + + if not doc: + return [], {} + + # Store the Document in the appropriate vector store + self.neo4j_model.store_documents([doc], node_label=node_label) + + # Access the schema property correctly + schema = self.neo4j_model.graph.get_schema + + # Perform vector similarity search + similar_docs_result = self.retrieval_chain.invoke({"input": resume_text}) # Corrected to 'context' + similar_docs = similar_docs_result.get("output", []) + print("similar_docs_result:", similar_docs_result) + print("Keys in similar_docs_result:", similar_docs_result.keys()) + + + + for doc in similar_docs: + print("Document Metadata:", doc.metadata) + + query = f"Based on the following resume, recommend relevant job positions: {resume_text}" + graph_response = self.graph_chain.invoke({"query": query, "schema": schema}) + # After graph query + print("Graph Response:") + print(graph_response) + + return similar_docs, graph_response \ No newline at end of file diff --git a/be_repo/modules/verify.py b/be_repo/modules/verify.py new file mode 100644 index 000000000..a5ec2e025 --- /dev/null +++ b/be_repo/modules/verify.py @@ -0,0 +1,12 @@ +from neo4j import GraphDatabase + +uri = "neo4j+ssc://7bf5a48e.databases.neo4j.io" # Update with your Neo4j URI +username = "neo4j" # Update with your username +password = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Update with your password + +driver = GraphDatabase.driver(uri, auth=(username, password)) + +with driver.session() as session: + result = session.run("SHOW INDEXES") + for record in result: + print(record["name"]) diff --git a/be_repo/modules/view.py b/be_repo/modules/view.py new file mode 100644 index 000000000..b7af1eaa1 --- /dev/null +++ b/be_repo/modules/view.py @@ -0,0 +1,35 @@ +# view.py + +class CLIView: + def __init__(self): + pass + + def get_resume_input(self): + """ + Prompt user to input their resume text. + """ + print("Welcome to the Job Recommendation System!") + print("Please paste your resume text below (end input with an empty line):") + resume_lines = [] + while True: + try: + line = input() + if line.strip() == "": + break + resume_lines.append(line) + except EOFError: + # Handle end of file (e.g., user sends EOF signal) + break + resume_text = "\n".join(resume_lines) + return resume_text + + def display_recommendations(self, recommendations): + """ + Display job recommendations to the user. + """ + if not recommendations: + print("No job recommendations found based on your resume.") + return + print("\nRecommended Jobs for You:") + for idx, job in enumerate(recommendations, start=1): + print(f"{idx}. 
{job}") diff --git a/be_repo/preprocess/embedding.py b/be_repo/preprocess/embedding.py new file mode 100644 index 000000000..65c319f1b --- /dev/null +++ b/be_repo/preprocess/embedding.py @@ -0,0 +1,104 @@ +# embedding_generation.py +import pandas as pd +import json +import os +import time +from tqdm import tqdm +import logging + + +# Initialize the OpenAI API client +from configs.openai_client import get_openai_client + +client = get_openai_client() + +# Setup logging +log_file = 'embedding_generation.log' +logging.basicConfig(filename=log_file, level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger() + +# Initialize the embedding model +default_model = "text-embedding-ada-002" # Updated to a commonly used model + +# Directory containing the original CSV files +csv_dir = 'neo4j_csv' + +# Directory to save updated CSV files with embeddings +updated_csv_dir = 'neo4j_csv_with_embeddings' +os.makedirs(updated_csv_dir, exist_ok=True) + +# Node types and their attributes +node_types = { + 'Edu': ['id', 'deg', 'f_study', 'inst', 's_year', 'e_year', 'gpa'], + 'WE': ['id', 'pos', 'comp', 'loc'], + 'Proj': ['id', 'ttl', 'desc', 'tech', 'role'], + 'Skill': ['id', 'name'], + 'Cert': ['id', 'name', 'issuer', 'exp'], + 'SSkill': ['id', 'name'], + 'JD': ['id', 'comp', 'req', 'resp', 'loc'], + 'JTitle': ['id', 'ttl', 'lvl', 'cat'], + 'JKeyword': ['id', 'keyword'], + 'Indus': ['id', 'name'], +} + +# Load node CSV files into DataFrames +node_dfs = {} +for node_type in node_types.keys(): + file_path = os.path.join(csv_dir, f'{node_type}.csv') + if os.path.exists(file_path): + df = pd.read_csv(file_path) + node_dfs[node_type] = df + else: + print(f"CSV file for node type '{node_type}' not found in '{csv_dir}'.") + +# Function to generate embeddings for a node DataFrame +def generate_embeddings_for_node(df, attributes, model, client, batch_size=100): + # Exclude 'id' from attributes to be concatenated + text_attributes = [attr for attr in attributes if attr not in ['id', 'embedding']] + # Concatenate all text attributes into a single string per row + texts = df[text_attributes].fillna('').astype(str).agg(' '.join, axis=1).tolist() + + embeddings = [] + for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings in batches"): + batch_texts = texts[i:i+batch_size] + if i == 1: + break + try: + response = client.embeddings.create(input=batch_texts, model=model) + + batch_embeddings = [item.embedding for item in response.data] + embeddings.extend(batch_embeddings) + time.sleep(1) # To respect rate limits; adjust as necessary + except Exception as e: + print(f"Error generating embeddings for batch {i//batch_size + 1}: {e}") + # Optionally, append None or a placeholder + embeddings.extend([None] * len(batch_texts)) + + if len(embeddings) != len(texts): + raise ValueError("Number of embeddings does not match number of texts.") + + return embeddings + +# Generate embeddings for each node type +for node_type, df in node_dfs.items(): + print(f"Generating embeddings for node type '{node_type}'...") + attributes = node_types[node_type] + # Generate embeddings + embeddings = generate_embeddings_for_node(df, attributes, default_model, client) + # Add embeddings to the DataFrame + df['embedding'] = embeddings + # Update the DataFrame in the dictionary + node_dfs[node_type] = df + +# Save updated node CSV files with embeddings +for node_type, df in node_dfs.items(): + # Convert embedding arrays to JSON strings for storage + if 'embedding' in df.columns: + 
df['embedding'] = df['embedding'].apply(lambda x: json.dumps(x) if isinstance(x, list) else '[]') + # Save the updated DataFrame to a CSV file + file_path = os.path.join(updated_csv_dir, f'{node_type}.csv') + df.to_csv(file_path, index=False) + print(f"Updated CSV file saved for node type '{node_type}'.") + +print("Embedding generation and CSV update completed.") diff --git a/be_repo/preprocess/graph-preprocess-simpl.py b/be_repo/preprocess/graph-preprocess-simpl.py new file mode 100644 index 000000000..5fd3a330e --- /dev/null +++ b/be_repo/preprocess/graph-preprocess-simpl.py @@ -0,0 +1,749 @@ +import pandas as pd +import openai +import json +import csv +import os +import re +import time + +# Initialize the OpenAI API client +from configs.openai_client import get_openai_client + +client = get_openai_client() + +# Load your dataset +data = pd.read_csv('resume_job_description_fit_train.csv') + +# Create a directory for storing debug responses +debug_dir = 'debug_responses' +os.makedirs(debug_dir, exist_ok=True) + +# Define the node types and attributes (with shorter names) +node_types = { + 'Edu': ['id', 'deg', 'f_study', 'inst', 's_year', 'e_year', 'gpa'], + 'WE': ['id', 'pos', 'comp', 'loc'], + 'Proj': ['id', 'ttl', 'desc', 'tech', 'role'], + 'Skill': ['id', 'name'], + 'Cert': ['id', 'name', 'issuer', 'exp'], + 'SSkill': ['id', 'name'], + 'JD': ['id', 'comp', 'req', 'resp', 'loc'], + 'JTitle': ['id', 'ttl', 'lvl', 'cat'], + 'JKeyword': ['id', 'keyword'], + 'Indus': ['id', 'name'], +} + +relationships = [ + {'type': 'UTILIZES_SKILL', 'start_node': 'WE', 'end_node': 'Skill'}, + {'type': 'USES_TECH', 'start_node': 'Proj', 'end_node': 'Skill'}, + {'type': 'REL_TO', 'start_node': 'Proj', 'end_node': 'Skill'}, + {'type': 'DESCRIBES', 'start_node': 'JD', 'end_node': 'JTitle'}, + {'type': 'REQ_SKILL', 'start_node': 'JTitle', 'end_node': 'Skill'}, + {'type': 'ASSOC_WITH', 'start_node': 'JTitle', 'end_node': 'JKeyword'}, + {'type': 'MENTIONS', 'start_node': 'JD', 'end_node': 'JKeyword'}, + {'type': 'REQ_EDU', 'start_node': 'JD', 'end_node': 'Edu'}, + {'type': 'DESIRES_SSKILL', 'start_node': 'JD', 'end_node': 'SSkill'}, + {'type': 'BELONGS_TO_INDUS', 'start_node': 'JTitle', 'end_node': 'Indus'}, + {'type': 'REL_TO', 'start_node': 'Skill', 'end_node': 'Skill'}, + {'type': 'REQ_CERT', 'start_node': 'JTitle', 'end_node': 'Cert'}, + {'type': 'BELONGS_TO_INDUS', 'start_node': 'JD', 'end_node': 'Indus'}, + {'type': 'IN_INDUS', 'start_node': 'WE', 'end_node': 'Indus'}, + {'type': 'REL_TO', 'start_node': 'Cert', 'end_node': 'Skill'}, + {'type': 'SIMILAR_TO', 'start_node': 'JKeyword', 'end_node': 'JKeyword'}, + {'type': 'SIMILAR_TO', 'start_node': 'Skill', 'end_node': 'Skill'}, +] + +# Initialize dictionaries to store nodes and relationships +node_id_counter = 1 # Global counter for node IDs +node_mappings = {node_type: {} for node_type in node_types} + +# Initialize relationships data list +relationships_data = [] + +# Create output directory if not exists +output_dir = 'neo4j_csv' +os.makedirs(output_dir, exist_ok=True) + +# Initialize CSV writers for nodes +node_files = {} +node_writers = {} + +for node_type, attributes in node_types.items(): + file_path = os.path.join(output_dir, f'{node_type}.csv') + node_file = open(file_path, 'w', newline='', encoding='utf-8') + node_writer = csv.DictWriter(node_file, fieldnames=attributes) + node_writer.writeheader() + node_files[node_type] = node_file + node_writers[node_type] = node_writer + +# Initialize CSV writer for relationships +relationship_keys = 
['start_node_id', 'relationship_type', 'end_node_id'] +relationships_file = open(os.path.join(output_dir, 'relationships.csv'), 'w', newline='', encoding='utf-8') +relationships_writer = csv.DictWriter(relationships_file, fieldnames=relationship_keys) +relationships_writer.writeheader() + +def generate_prompt(resume_text, job_description_text): + prompt = f""" +Extract entities and relationships from the following resume and job description. + +Your task is to extract both **entities** and **relationships** from the provided texts. + +**Valid Relationship Types:** + +- **UTILIZES_SKILL:** A Work Experience (`WE`) node **utilizes** a Skill (`Skill`) node. For example, a person used Python during their job at a company. + +- **USES_TECH:** A Project (`Proj`) node **uses** a Skill (`Skill`) node as a technology. For example, a project implemented using React.js. + +- **REL_TO (Proj to Skill):** A Project (`Proj`) node is **related to** a Skill (`Skill`) node, indicating relevance or association. + +- **DESCRIBES:** A Job Description (`JD`) node **describes** a Job Title (`JTitle`) node, providing details about the role. + +- **REQ_SKILL:** A Job Title (`JTitle`) node **requires** a Skill (`Skill`) node. The skill is necessary for the job position. + +- **ASSOC_WITH:** A Job Title (`JTitle`) node is **associated with** a Job Keyword (`JKeyword`) node, linking keywords relevant to the job. + +- **MENTIONS:** A Job Description (`JD`) node **mentions** a Job Keyword (`JKeyword`) node. The keyword appears in the job description. + +- **REQ_EDU:** A Job Description (`JD`) node **requires** an Education (`Edu`) node, such as a degree or certification. + +- **DESIRES_SSKILL:** A Job Description (`JD`) node **desires** a Soft Skill (`SSkill`) node, indicating preferred soft skills. + +- **BELONGS_TO_INDUS (JTitle to Indus):** A Job Title (`JTitle`) node **belongs to** an Industry (`Indus`) node, specifying the industry category. + +- **REL_TO (Skill to Skill):** A Skill (`Skill`) node is **related to** another Skill (`Skill`) node, indicating similarity or complementarity. + +- **REQ_CERT:** A Job Title (`JTitle`) node **requires** a Certification (`Cert`) node. The certification is needed for the role. + +- **BELONGS_TO_INDUS (JD to Indus):** A Job Description (`JD`) node **belongs to** an Industry (`Indus`) node, providing industry context. + +- **IN_INDUS:** A Work Experience (`WE`) node is **in** an Industry (`Indus`) node, indicating the industry of the work experience. + +- **REL_TO (Cert to Skill):** A Certification (`Cert`) node is **related to** a Skill (`Skill`) node, showing the skill validated by the certification. + +- **SIMILAR_TO (JKeyword to JKeyword):** A Job Keyword (`JKeyword`) node is **similar to** another Job Keyword (`JKeyword`) node, indicating related concepts. + +- **SIMILAR_TO (Skill to Skill):** A Skill (`Skill`) node is **similar to** another Skill (`Skill`) node, suggesting related skills. 
+ +Resume: +\"\"\" +{resume_text} +\"\"\" + +Job Description: +\"\"\" +{job_description_text} +\"\"\" + +Remember to reference nodes using their `id` fields in the relationships, return the data in the following JSON format without additional explanations: + +{{ + ""nodes"": {{ + ""Edu"": [ + {{ + ""id"": ""edu1"", + ""deg"": ""Bachelor of Science"", + ""f_study"": ""Computer Science"", + ""inst"": ""XYZ University"", + ""s_year"": ""2010"", + ""e_year"": ""2014"", + ""gpa"": ""3.8"" + }} + ], + ""WE"": [ + {{ + ""id"": ""we1"", + ""pos"": ""Data Analyst"", + ""comp"": ""Tech Solutions Inc"", + ""loc"": ""San Francisco"" + }} + ], + ""Proj"": [ + {{ + ""id"": ""proj1"", + ""ttl"": ""Data Migration Project"", + ""desc"": ""Migrated data from legacy systems to cloud"", + ""tech"": ""AWS, Python"", + ""role"": ""Lead Developer"" + }} + ], + ""Skill"": [ + {{ + ""id"": ""skill1"", + ""name"": ""SQL"" + }}, + {{ + ""id"": ""skill2"", + ""name"": ""Python"" + }}, + {{ + ""id"": ""skill3"", + ""name"": ""Data Modeling"" + }}, + {{ + ""id"": ""skill4"", + ""name"": ""ETL"" + }} + ], + ""Cert"": [ + {{ + ""id"": ""cert1"", + ""name"": ""AWS Certified Solutions Architect"", + ""issuer"": ""Amazon"", + ""exp"": ""2023"" + }} + ], + ""SSkill"": [ + {{ + ""id"": ""sskill1"", + ""name"": ""Team Leadership"" + }} + ], + ""JD"": {{ + ""id"": ""jd1"", + ""comp"": ""ABC Corp"", + ""req"": ""Experience in Python, SQL, and data analysis"", + ""resp"": ""Manage data integration projects"", + ""loc"": ""New York"" + }}, + ""JTitle"": {{ + ""id"": ""jtitle1"", + ""ttl"": ""Data Engineer"", + ""lvl"": ""Mid"", + ""cat"": ""Engineering"" + }}, + ""JKeyword"": [ + {{ + ""id"": ""jkeyword1"", + ""keyword"": ""Big Data"" + }} + ], + ""Indus"": [ + {{ + ""id"": ""indus1"", + ""name"": ""Information Technology"" + }} + ] + }}, + ""rels"": [ + {{ + ""s_type"": ""WE"", + ""s_id"": ""we1"", + ""rel"": ""UTILIZES_SKILL"", + ""e_type"": ""Skill"", + ""e_id"": ""skill1"" + }}, + {{ + ""s_type"": ""Proj"", + ""s_id"": ""proj1"", + ""rel"": ""USES_TECH"", + ""e_type"": ""Skill"", + ""e_id"": ""skill2"" + }}, + {{ + ""s_type"": ""Proj"", + ""s_id"": ""proj1"", + ""rel"": ""REL_TO"", + ""e_type"": ""Skill"", + ""e_id"": ""skill3"" + }}, + {{ + ""s_type"": ""JD"", + ""s_id"": ""jd1"", + ""rel"": ""DESCRIBES"", + ""e_type"": ""JTitle"", + ""e_id"": ""jtitle1"" + }}, + {{ + ""s_type"": ""JTitle"", + ""s_id"": ""jtitle1"", + ""rel"": ""REQ_SKILL"", + ""e_type"": ""Skill"", + ""e_id"": ""skill2"" + }}, + {{ + ""s_type"": ""JTitle"", + ""s_id"": ""jtitle1"", + ""rel"": ""ASSOC_WITH"", + ""e_type"": ""JKeyword"", + ""e_id"": ""jkeyword1"" + }}, + {{ + ""s_type"": ""JD"", + ""s_id"": ""jd1"", + ""rel"": ""MENTIONS"", + ""e_type"": ""JKeyword"", + ""e_id"": ""jkeyword1"" + }}, + {{ + ""s_type"": ""JD"", + ""s_id"": ""jd1"", + ""rel"": ""REQ_EDU"", + ""e_type"": ""Edu"", + ""e_id"": ""edu1"" + }}, + {{ + ""s_type"": ""JD"", + ""s_id"": ""jd1"", + ""rel"": ""DESIRES_SSKILL"", + ""e_type"": ""SSkill"", + ""e_id"": ""sskill1"" + }}, + {{ + ""s_type"": ""JTitle"", + ""s_id"": ""jtitle1"", + ""rel"": ""BELONGS_TO_INDUS"", + ""e_type"": ""Indus"", + ""e_id"": ""indus1"" + }}, + {{ + ""s_type"": ""Skill"", + ""s_id"": ""skill3"", + ""rel"": ""REL_TO"", + ""e_type"": ""Skill"", + ""e_id"": ""skill4"" + }}, + {{ + ""s_type"": ""JTitle"", + ""s_id"": ""jtitle1"", + ""rel"": ""REQ_CERT"", + ""e_type"": ""Cert"", + ""e_id"": ""cert1"" + }}, + {{ + ""s_type"": ""JD"", + ""s_id"": ""jd1"", + ""rel"": ""BELONGS_TO_INDUS"", + ""e_type"": ""Indus"", + 
""e_id"": ""indus1"" + }}, + {{ + ""s_type"": ""WE"", + ""s_id"": ""we1"", + ""rel"": ""IN_INDUS"", + ""e_type"": ""Indus"", + ""e_id"": ""indus1"" + }}, + {{ + ""s_type"": ""Cert"", + ""s_id"": ""cert1"", + ""rel"": ""REL_TO"", + ""e_type"": ""Skill"", + ""e_id"": ""skill4"" + }}, + {{ + ""s_type"": ""JKeyword"", + ""s_id"": ""jkeyword1"", + ""rel"": ""SIMILAR_TO"", + ""e_type"": ""JKeyword"", + ""e_id"": ""jkeyword1"" + }}, + {{ + ""s_type"": ""Skill"", + ""s_id"": ""skill1"", + ""rel"": ""SIMILAR_TO"", + ""e_type"": ""Skill"", + ""e_id"": ""skill68"" + }} + ] +}} + +""" + return prompt + + +function_schema = [ + { + "name": "extract_entities_and_relationships", + "description": "Extract entities and relationships from resume and job description.", + "parameters": { + "type": "object", + "properties": { + "nodes": { + "type": "object", + "properties": { + "Edu": { # Education + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "deg": {"type": "string"}, # degree + "f_study": {"type": "string"}, # field_of_study + "inst": {"type": "string"}, # institution + "s_year": {"type": "string"}, # start_year + "e_year": {"type": "string"}, # end_year + "gpa": {"type": "string"}, # GPA + }, + "required": ["id", "deg", "inst"] + }, + }, + "WE": { # WorkExperience + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "pos": {"type": "string"}, # position + "comp": {"type": "string"}, # company + "loc": {"type": "string"}, # location + }, + "required": ["id", "pos", "comp"] + }, + }, + "Proj": { # Project + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "ttl": {"type": "string"}, # title + "desc": {"type": "string"}, # description + "tech": {"type": "string"}, # technologies_used + "role": {"type": "string"}, # role + }, + "required": ["id", "ttl"] + }, + }, + "Skill": { # Skill + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "name": {"type": "string"} + }, + "required": ["id", "name"] + }, + }, + "Cert": { # Certification + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "name": {"type": "string"}, + "issuer": {"type": "string"}, # issuing_organization + "exp": {"type": "string"}, # expiration_date + }, + "required": ["id", "name"] + }, + }, + "SSkill": { # SoftSkill + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "name": {"type": "string"} + }, + "required": ["id", "name"] + }, + }, + "JD": { # JobDescription + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "comp": {"type": "string"}, # company + "req": {"type": "string"}, # requirements + "resp": {"type": "string"}, # responsibilities + "loc": {"type": "string"}, # location + }, + "required": ["id", "comp"] + }, + "JTitle": { # JobTitle + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "ttl": {"type": "string"}, # title + "lvl": {"type": "string"}, # level + "cat": {"type": "string"}, # category + }, + "required": ["id"] # 'ttl' is optional + }, + "JKeyword": { # JobKeyword + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "keyword": {"type": "string"} + }, + "required": ["id", "keyword"] + }, 
+ }, + "Indus": { # Industry + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, # Unique identifier + "name": {"type": "string"} + }, + "required": ["id", "name"] + }, + }, + }, + }, + "rels": { # relationships + "type": "array", + "items": { + "type": "object", + "properties": { + "s_type": {"type": "string"}, # start_node_type + "s_id": {"type": "string"}, # start_node_id + "rel": {"type": "string"}, # relationship_type + "e_type": {"type": "string"}, # end_node_type + "e_id": {"type": "string"}, # end_node_id + }, + "required": ["s_type", "s_id", "rel", "e_type", "e_id"] + }, + }, + }, + "required": ["nodes", "rels"], + }, + } +] + + + +# Mapping from LLM 'id's to node attribute keys, per batch +llm_id_to_attr_key = {} + +# Helper functions +def get_or_create_node_id(node_type, node_value): + global node_id_counter + node_dict = node_mappings[node_type] + if isinstance(node_value, dict): + # Filter out empty or null values, excluding 'id' + filtered_items = {k: v for k, v in node_value.items() if v and k != 'id'} + key = tuple(sorted(filtered_items.items())) + else: + key = node_value.lower() + + if key in node_dict: + return node_dict[key], False, key + else: + node_id = node_id_counter + node_id_counter += 1 + node_dict[key] = node_id + return node_id, True, key + +def process_singular_entity(node_type, node_writer, keys, node_values): + llm_id = node_values.get('id') + if not llm_id: + print(f"No 'id' found for {node_type} node.") + return None + + meaningful_data = any(v for k, v in node_values.items() if v and k != 'id') + if not meaningful_data: + print(f"Skipping empty {node_type} node.") + return None + + node_id, is_new, attr_key = get_or_create_node_id(node_type, node_values) + llm_id_to_attr_key[llm_id] = (node_type, attr_key) + + if is_new: + row_data = {'id': node_id} + row_data.update({key: node_values.get(key, '') for key in keys if key != 'id'}) + node_writer.writerow(row_data) + node_files[node_type].flush() + return node_id + + +def process_list_of_entities(node_type, node_writer, keys, node_values_list): + ids = [] + for node in node_values_list: + llm_id = node.get('id') + if not llm_id: + print(f"No 'id' found for {node_type} node.") + continue + + node_id, is_new, attr_key = get_or_create_node_id(node_type, node) + llm_id_to_attr_key[llm_id] = (node_type, attr_key) + + if is_new: + row_data = {'id': node_id} + row_data.update({key: node.get(key, '') for key in keys if key != 'id'}) + node_writer.writerow(row_data) + node_files[node_type].flush() + ids.append(node_id) + return ids + +def get_node_id_from_llm_id(llm_id): + mapping = llm_id_to_attr_key.get(llm_id) + if not mapping: + return None + node_type, attr_key = mapping + return node_mappings[node_type].get(attr_key) + +def process_singular_string_entity(node_type, node_writer, keys, node_value): + node_id, is_new = get_or_create_node_id(node_type, node_value) + print(f"Processing singular string entity for {node_type}, is_new={is_new}") + if is_new: + key_name = 'name' if 'name' in keys else 'keyword' + row_data = {'id': node_id, key_name: node_value} + node_writer.writerow(row_data) + node_files[node_type].flush() + print(f"Wrote new {node_type} node to CSV: {row_data}") + else: + print(f"{node_type} node already exists: {node_value}") + return node_id + +# Function to retrieve node ID based on type and reference +def get_node_id(node_type, node_ref, node_ids): + if isinstance(node_ref, int): + # Index-based reference + node_list = node_ids.get((node_type, 
'list'), []) + if node_ref < len(node_list): + return node_list[node_ref] + else: + print(f"Invalid node index {node_ref} for node type {node_type}") + return None + else: + # Name-based reference (case-insensitive) + key = node_ref.lower() + return node_mappings[node_type].get(key) + + +# Process each row in the dataset +for index, row in data.iterrows(): + #if index == 3146: + # break + + # Skip rows where 'label' is 'No Fit' + if row['label'].strip().lower() == 'no fit': + print(f"Skipping index {index} due to label 'No Fit'.") + continue + + resume_text = row['resume_text'] + job_description_text = row['job_description_text'] + label = row['label'] + candidate_id = f"candidate_{index}" # Generate a unique candidate_id + job_id = f"job_{index}" # Generate a unique job_id + + # Generate the prompt + prompt = generate_prompt(resume_text, job_description_text) + + # Initialize retry counter + retry_count = 0 + success = False + + MAX_RETRIES = 3 + + while retry_count < MAX_RETRIES and not success: + try: + # Call the OpenAI API with function calling + response = client.chat.completions.create( + model='gpt-4o-mini', # Updated model version + messages=[ + {"role": "system", "content": "You are an expert data annotator."}, + {"role": "user", "content": prompt} + ], + functions=function_schema, + function_call={"name": "extract_entities_and_relationships"}, + max_tokens=2000, # Adjusted to prevent truncation + temperature=0, + top_p=1, + n=1, + stop=None, + ) + + # Extract the function call response using attribute access + message = response.choices[0].message + + if message.function_call: + function_args = message.function_call.arguments + print(f"Function Arguments at index {index}: {function_args}") + try: + extracted_data = json.loads(function_args) + success = True + except json.JSONDecodeError as e: + print(f"Failed to parse function arguments at index {index}. Error: {e}") + print(f"Function Arguments: {function_args[:500]}...") # Print first 500 chars for inspection + extracted_data = None + else: + print(f"No function call in response at index {index}.") + extracted_data = None + + except openai.BadRequestError as e: + print(f"OpenAI API request failed at index {index}: {e}") + break # Skip to the next row + except openai.OpenAIError as e: + print(f"OpenAI API encountered an error at index {index}: {e}") + retry_count += 1 + print(f"Retrying... 
({retry_count}/{MAX_RETRIES})") + time.sleep(2) # Wait before retrying + + if not success: + print(f"Skipping index {index} after {MAX_RETRIES} failed attempts.") + continue + + if extracted_data: + print(f"Parsed data at index {index}: {extracted_data}") + # Process nodes + node_ids = {} # To store the IDs of nodes extracted in this iteration + extracted_nodes = extracted_data.get('nodes', {}) + relationships_list = extracted_data.get('rels', []) + + # Process each node type + for node_type, node_values in extracted_nodes.items(): + if node_type in node_types: + node_writer = node_writers[node_type] + keys = node_types[node_type] + + if isinstance(node_values, dict): + print(f"Processing singular entity for {node_type}") + node_id = process_singular_entity(node_type, node_writer, keys, node_values) + node_ids[(node_type, 'single')] = node_id + elif isinstance(node_values, list): + print(f"Processing list entities for {node_type}") + ids = process_list_of_entities(node_type, node_writer, keys, node_values) + node_ids[(node_type, 'list')] = ids + elif isinstance(node_values, str): + print(f"Processing singular string entity for {node_type}") + node_id = process_singular_string_entity(node_type, node_writer, keys, node_values) + node_ids[(node_type, 'single')] = node_id + else: + print(f"Unexpected data type for node_type {node_type} at index {index}") + else: + print(f"Unknown node type: {node_type} at index {index}") + + # Process relationships + for rel in relationships_list: + s_type = rel.get('s_type') # start_node_type + e_type = rel.get('e_type') # end_node_type + s_id = rel.get('s_id') # start_node_id from LLM + e_id = rel.get('e_id') # end_node_id from LLM + + rel_type = rel.get('rel') # relationship_type + + start_node_id = get_node_id_from_llm_id(s_id) + if not start_node_id: + print(f"Start node with LLM id '{s_id}' not found for relationship at index {index}") + continue + + end_node_id = get_node_id_from_llm_id(e_id) + if not end_node_id: + print(f"End node with LLM id '{e_id}' not found for relationship at index {index}") + continue + + # Write relationship + relationships_writer.writerow({ + 'start_node_id': start_node_id, + 'relationship_type': rel_type, + 'end_node_id': end_node_id + }) + relationships_file.flush() + print(f"Wrote relationship: {rel_type} from {s_type}({start_node_id}) to {e_type}({end_node_id})") + # After processing relationships + llm_id_to_attr_key.clear() + else: + print(f"Skipping index {index} due to parsing error.") + +# Close all node CSV files +for node_file in node_files.values(): + node_file.close() + +# Close relationships CSV file +relationships_file.close() + +print("Data extraction complete. 
CSV files are saved in the 'neo4j_csv' directory.") diff --git a/be_repo/preprocess/neo4j_import.py b/be_repo/preprocess/neo4j_import.py new file mode 100644 index 000000000..afffd1dec --- /dev/null +++ b/be_repo/preprocess/neo4j_import.py @@ -0,0 +1,233 @@ +# neo4j_import.py + +import pandas as pd +from neo4j import GraphDatabase +import json +import os +from tqdm import tqdm +import logging + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Directory containing the updated CSV files with embeddings +csv_dir = 'neo4j_csv_with_embeddings' + +# Node types and their attributes +node_types = { + 'Edu': ['id', 'deg', 'f_study', 'inst', 's_year', 'e_year', 'gpa', 'embedding'], + 'WE': ['id', 'pos', 'comp', 'loc', 'embedding'], + 'Proj': ['id', 'ttl', 'desc', 'tech', 'role', 'embedding'], + 'Skill': ['id', 'name', 'embedding'], + 'Cert': ['id', 'name', 'issuer', 'exp', 'embedding'], + 'SSkill': ['id', 'name', 'embedding'], + 'JD': ['id', 'comp', 'req', 'resp', 'loc', 'embedding'], + 'JTitle': ['id', 'ttl', 'lvl', 'cat', 'embedding'], + 'JKeyword': ['id', 'keyword', 'embedding'], + 'Indus': ['id', 'name', 'embedding'], +} + +# Neo4j connection details from environment variables +uri = "neo4j+ssc://7bf5a48e.databases.neo4j.io" +AUTH = ("neo4j", "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc") + +# Initialize Neo4j driver +driver = GraphDatabase.driver(uri, auth=AUTH) + +# Verify connectivity +try: + driver.verify_connectivity() + logger.info("Successfully connected to Neo4j.") +except Exception as e: + logger.error(f"Failed to connect to Neo4j: {e}") + driver.close() + exit(1) + +# Function to load node CSV files into DataFrames +def load_node_dataframes(csv_dir, node_types): + node_dfs = {} + for node_type in node_types.keys(): + file_path = os.path.join(csv_dir, f'{node_type}.csv') + if os.path.exists(file_path): + df = pd.read_csv(file_path) + node_dfs[node_type] = df + logger.info(f"Loaded {len(df)} records for node type '{node_type}'.") + else: + logger.warning(f"CSV file for node type '{node_type}' not found in '{csv_dir}'.") + return node_dfs + +# Function to load relationships CSV file into a DataFrame +def load_relationships_data(csv_dir): + relationships_file = os.path.join(csv_dir, 'relationships.csv') + if os.path.exists(relationships_file): + df = pd.read_csv(relationships_file) + logger.info(f"Loaded {len(df)} relationship records.") + return df + else: + logger.warning(f"Relationships CSV file not found in '{csv_dir}'.") + return None + +# Function to create constraints +def create_constraints(driver): + constraints = [ + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Edu) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:WE) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Proj) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Skill) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Cert) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:SSkill) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:JD) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:JTitle) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:JKeyword) REQUIRE n.id IS UNIQUE", + "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Indus) REQUIRE n.id IS UNIQUE", + ] + with driver.session() as session: + for constraint in constraints: + try: + session.run(constraint) + logger.info(f"Executed 
constraint: {constraint}") + except Exception as e: + logger.error(f"Failed to execute constraint '{constraint}': {e}") + logger.info("Constraints created or already exist.") + +def standardize_relationship_types(df): + if 'relationship_type' in df.columns: + original_types = df['relationship_type'].unique() + df['relationship_type'] = df['relationship_type'].str.upper().str.replace(' ', '_').str.replace('[^A-Z0-9_]', '', regex=True) + standardized_types = df['relationship_type'].unique() + logger.info(f"Standardized relationship types from {len(original_types)} to {len(standardized_types)} unique types.") + return df + + +# Function to import nodes into Neo4j in batches +def import_nodes_in_batches(tx, node_type, df, batch_size=1000): + columns = df.columns.tolist() + # Prepare property assignments excluding 'id' + set_props = ', '.join([f"n.{col} = row.{col}" for col in columns if col != 'id']) + query = f""" + UNWIND $rows AS row + MERGE (n:{node_type} {{ id: toInteger(row.id) }}) + ON CREATE SET + {set_props} + """ + # Convert embedding JSON strings back to lists if present + if 'embedding' in df.columns: + df['embedding'] = df['embedding'].apply(lambda x: json.loads(x) if pd.notnull(x) else []) + data = df.to_dict('records') + for i in tqdm(range(0, len(data), batch_size), desc=f"Importing {node_type} in batches"): + batch = data[i:i+batch_size] + try: + tx.run(query, rows=batch) + logger.info(f"Imported batch {i//batch_size + 1} for node type '{node_type}'.") + except Exception as e: + logger.error(f"Error importing batch {i//batch_size + 1} for node type '{node_type}': {e}") + +# Function to create a mapping from ID to node type +def create_id_to_type_mapping(node_dfs): + id_to_type = {} + for node_type, df in node_dfs.items(): + for node_id in df['id']: + try: + id_to_type[int(node_id)] = node_type + except ValueError: + logger.warning(f"Invalid ID '{node_id}' in node type '{node_type}'. 
Skipping.") + logger.info("Created ID to node type mapping.") + return id_to_type + +# Function to infer node types for relationships +def infer_node_types(rel_df, id_to_type): + rel_df['start_node_type'] = rel_df['start_node_id'].apply(lambda x: id_to_type.get(int(x), 'Unknown')) + rel_df['end_node_type'] = rel_df['end_node_id'].apply(lambda x: id_to_type.get(int(x), 'Unknown')) + unknown_start = rel_df[rel_df['start_node_type'] == 'Unknown'] + unknown_end = rel_df[rel_df['end_node_type'] == 'Unknown'] + if not unknown_start.empty or not unknown_end.empty: + logger.warning("Some node IDs could not be mapped to any node type.") + logger.warning("Unknown Start Nodes:") + logger.warning(unknown_start) + logger.warning("Unknown End Nodes:") + logger.warning(unknown_end) + return rel_df + +def import_relationships_in_batches(tx, df, batch_size=1000): + data = df.to_dict('records') + for i in tqdm(range(0, len(data), batch_size), desc="Importing relationships in batches"): + batch = data[i:i+batch_size] + unwind_data = [ + { + "start_id": int(rel['start_node_id']), + "end_id": int(rel['end_node_id']), + "rel_type": rel['relationship_type'] + } + for rel in batch + ] + query = """ + UNWIND $rows AS row + MATCH (a {id: row.start_id}) + MATCH (b {id: row.end_id}) + CALL apoc.merge.relationship(a, row.rel_type, {}, {}, b) YIELD rel + RETURN rel + """ + try: + tx.run(query, rows=unwind_data) + logger.info(f"Imported batch {i//batch_size + 1} of relationships.") + except Exception as e: + logger.error(f"Error importing batch {i//batch_size + 1} of relationships: {e}") + + +# Main function to perform the import +def main(): + # Load node and relationship data + node_dfs = load_node_dataframes(csv_dir, node_types) + relationship_df = load_relationships_data(csv_dir) + + # Create constraints + create_constraints(driver) + + # Create ID to type mapping + id_to_type = create_id_to_type_mapping(node_dfs) + + # Import nodes + with driver.session() as session: + for node_type, df in node_dfs.items(): + logger.info(f"Importing nodes for node type '{node_type}'...") + session.execute_write(import_nodes_in_batches, node_type, df) + logger.info("Node import completed.") + + # Import relationships + if relationship_df is not None: + # Standardize relationship types + relationship_df = standardize_relationship_types(relationship_df) + + # Infer node types if not present + if 'start_node_type' not in relationship_df.columns or 'end_node_type' not in relationship_df.columns: + logger.info("Inferring 'start_node_type' and 'end_node_type' based on node IDs...") + relationship_df = infer_node_types(relationship_df, id_to_type) + + # Check for unknown node types + unknown_rels = relationship_df[ + (relationship_df['start_node_type'] == 'Unknown') | + (relationship_df['end_node_type'] == 'Unknown') + ] + if not unknown_rels.empty: + logger.error("Some relationships have unknown node types. 
Please verify your data.") + logger.error(unknown_rels) + # Skip unknown relationships + relationship_df = relationship_df[ + (relationship_df['start_node_type'] != 'Unknown') & + (relationship_df['end_node_type'] != 'Unknown') + ] + + # Import relationships + with driver.session() as session: + logger.info("Importing relationships...") + session.execute_write(import_relationships_in_batches, relationship_df) + logger.info("Relationship import completed.") + else: + logger.info("No relationships to import.") + + driver.close() + logger.info("Neo4j import completed.") + +if __name__ == "__main__": + main() From 3f5c4997f489ac63603a8fc8f0275ebd325363b0 Mon Sep 17 00:00:00 2001 From: zihan zhou <95243748+andyasdd1@users.noreply.github.com> Date: Sun, 1 Dec 2024 17:20:52 -0500 Subject: [PATCH 3/9] Update: Add in response from cypherqagraph. Fix: Fix issue where job title doesn't exists, allow node search switch --- be_repo/modules/job_recommendation_system.py | 4 +- be_repo/modules/recommendation_generator.py | 34 +++++++++------ be_repo/modules/retrieval_engine.py | 46 +++++++++----------- 3 files changed, 42 insertions(+), 42 deletions(-) diff --git a/be_repo/modules/job_recommendation_system.py b/be_repo/modules/job_recommendation_system.py index 95e4b609b..5e69615a4 100644 --- a/be_repo/modules/job_recommendation_system.py +++ b/be_repo/modules/job_recommendation_system.py @@ -8,7 +8,6 @@ import sys def main(): - # Redirect standard output to a file sys.stdout = open('output.log', 'w') @@ -34,6 +33,8 @@ def main(): password=NEO4J_PASSWORD ) + node_label = "JTitle" # Adjust as needed; could be dynamic based on user input or other criteria + # Initialize Controller Components resume_processor = ResumeProcessor() retrieval_engine = RetrievalEngine(resume_processor, neo4j_model) @@ -51,7 +52,6 @@ def main(): return # Perform Mixed Retrieval for 'JD' Node Label - node_label = "JD" # Adjust as needed; could be dynamic based on user input or other criteria similar_docs, graph_results = retrieval_engine.perform_mixed_retrieval(resume_text, node_label=node_label) if not similar_docs and not graph_results: diff --git a/be_repo/modules/recommendation_generator.py b/be_repo/modules/recommendation_generator.py index 767063c7d..da367a511 100644 --- a/be_repo/modules/recommendation_generator.py +++ b/be_repo/modules/recommendation_generator.py @@ -9,36 +9,42 @@ def merge_results(self, vector_docs, graph_results): # Process vector similarity results for doc in vector_docs: - comp = doc.metadata.get("comp", "") - resp = doc.metadata.get("resp", "") - job_title = f"{resp} at {comp}".strip() - if job_title: - combined_jobs[job_title] = combined_jobs.get(job_title, 0) + 1 + # Exclude 'id' and get all other non-empty metadata properties + metadata = {k: v for k, v in doc.metadata.items() if k != 'id' and v} + # Create a description string from the non-empty properties + job_description = ', '.join(f"{k}: {v}" for k, v in metadata.items()) + if job_description: + combined_jobs[job_description] = combined_jobs.get(job_description, 0) + 1 # Process graph traversal results - # Access the context from intermediate steps intermediate_steps = graph_results.get('intermediate_steps', []) if len(intermediate_steps) > 1: context = intermediate_steps[1].get('context', []) for job in context: - job_title = job.get('job_title', '') - company = job.get('company', '') - if job_title and company: - combined_job = f"{job_title} at {company}" - combined_jobs[combined_job] = combined_jobs.get(combined_job, 0) + 1 + # Exclude 'id' 
and get all other non-empty properties + job_data = {k: v for k, v in job.items() if k != 'id' and v} + # Create a description string + job_description = ', '.join(f"{k}: {v}" for k, v in job_data.items()) + if job_description: + combined_jobs[job_description] = combined_jobs.get(job_description, 0) + 1 + + # Include the 'result' from 'graph_results' directly + graph_result_text = graph_results.get('result', '').strip() + if graph_result_text: + combined_jobs[graph_result_text] = combined_jobs.get(graph_result_text, 0) + 1 # Convert to sorted list based on combined score sorted_jobs = sorted(combined_jobs.items(), key=lambda item: item[1], reverse=True) return [job for job, score in sorted_jobs] - + def generate_recommendations(self, vector_docs, graph_results): """ Generate a ranked list of job recommendations by merging vector and graph results. - + Parameters: vector_docs (List[Document]): Documents from vector similarity search. graph_results (dict): Results from graph traversal. - + Returns: List[str]: Ranked list of unique job recommendations. """ diff --git a/be_repo/modules/retrieval_engine.py b/be_repo/modules/retrieval_engine.py index 02fac70e3..83e2b6a83 100644 --- a/be_repo/modules/retrieval_engine.py +++ b/be_repo/modules/retrieval_engine.py @@ -26,13 +26,10 @@ def __init__(self, resume_processor, neo4j_model): self.graph_chain = self.neo4j_model.get_graph_chain() # Define the PromptTemplate with 'context' as input variable - prompt = PromptTemplate( - template=""" - You are an expert Cypher query writer for a Neo4j graph database. + template=""" + You are an assistant that matches resumes to relevant job descriptions. - Given the user's question, generate an efficient Cypher query that: - - extract entities and relationships from the following resume. - - Focus solely on the resume content. + Given the user's resume, find the most relevant job descriptions. **Entities to Extract:** - **Education (Edu):** Details about degrees, fields of study, institutions, start and end years, GPA. @@ -42,30 +39,20 @@ def __init__(self, resume_processor, neo4j_model): - **Certifications (Cert):** Certification names, issuing organizations, expiration dates. - **Soft Skills (SSkill):** Non-technical skills like leadership, communication. - **Relationships to Identify:** - - **UTILIZES_SKILL:** A Work Experience (WE) node utilizes a Skill (Skill) node. - - **USES_TECH:** A Project (Proj) node uses a Skill (Skill) node as a technology. - - **REL_TO (Proj to Skill):** A Project (Proj) node is related to a Skill (Skill) node. - - **REL_TO (Skill to Skill):** A Skill (Skill) node is similar to another Skill (Skill) node. - **Resume:** \"\"\" {context} \"\"\" - """, - input_variables=["input"] - ) - - # Create a documents chain - self.combine_docs_chain = create_stuff_documents_chain(self.llm, prompt=prompt) + """ - # Initialize Retrieval Chain - # Default node_label is 'JD'; can be adjusted as needed - self.retrieval_chain = create_retrieval_chain( - self.neo4j_model.get_retriever(node_label="JD"), - self.combine_docs_chain + self.prompt_template = PromptTemplate( + template=template, + input_variables=["input"] ) + # Create a documents chain + self.combine_docs_chain = create_stuff_documents_chain(self.llm, self.prompt_template) + def perform_mixed_retrieval(self, resume_text, node_label="JD"): """ Perform mixed retrieval using vector similarity and graph traversal. 
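
The hunk below moves retriever construction from __init__ into
perform_mixed_retrieval, so the vector index can be switched per call. A
minimal sketch of that pattern, with a hypothetical `stores` dict mapping
node labels to Neo4jVector instances:

    def get_retriever(stores, node_label="JD"):
        try:
            return stores[node_label].as_retriever()
        except KeyError:
            raise ValueError(f"Invalid node_label '{node_label}'.")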
@@ -89,14 +76,21 @@ def perform_mixed_retrieval(self, resume_text, node_label="JD"):
         # Access the schema property correctly
         schema = self.neo4j_model.graph.get_schema
 
+        # Get the retriever for the given node label
+        retriever = self.neo4j_model.get_retriever(node_label=node_label)
+
+        # Create the retrieval chain with the retriever and the combine_docs_chain
+        retrieval_chain = create_retrieval_chain(
+            retriever,
+            self.combine_docs_chain
+        )
+
         # Perform vector similarity search
-        similar_docs_result = self.retrieval_chain.invoke({"input": resume_text})  # Corrected to 'context'
+        similar_docs_result = retrieval_chain.invoke({"input": resume_text})  # Corrected to 'context'
         similar_docs = similar_docs_result.get("output", [])
         print("similar_docs_result:", similar_docs_result)
         print("Keys in similar_docs_result:", similar_docs_result.keys())
-
-
         for doc in similar_docs:
             print("Document Metadata:", doc.metadata)

From d2c49aea738bd4bf88f775fff532a5172f8e47df Mon Sep 17 00:00:00 2001
From: Linchen Xu <592789211@qq.com>
Date: Sun, 1 Dec 2024 18:25:52 -0500
Subject: [PATCH 4/9] refactor job_recommendation_system.py imports

---
 be_repo/modules/job_recommendation_system.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/be_repo/modules/job_recommendation_system.py b/be_repo/modules/job_recommendation_system.py
index 6b9003c11..79f84e1d3 100644
--- a/be_repo/modules/job_recommendation_system.py
+++ b/be_repo/modules/job_recommendation_system.py
@@ -1,12 +1,13 @@
 # job_recommendation_system.py
 
+import logging
+
 from .neo4j_model import Neo4jModel
+from .recommendation_generator import RecommendationGenerator
 from .resume_processor import ResumeProcessor
 from .retrieval_engine import RetrievalEngine
-from .recommendation_generator import RecommendationGenerator
 from .view import CLIView
-import logging
-import sys
+
 
 def job_recommend(resume_text, user_id):
     # Setup Logging
@@ -20,8 +21,8 @@ def job_recommend(resume_text, user_id):
 
     # Neo4j Connection Details
     NEO4J_URI = "neo4j+ssc://7bf5a48e.databases.neo4j.io"  # Replace with your Neo4j URI
-    NEO4J_USERNAME = "neo4j" # Replace with your Neo4j username
-    NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Replace with your Neo4j password
+    NEO4J_USERNAME = "neo4j"  # Replace with your Neo4j username
+    NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc"  # Replace with your Neo4j password
 
     # Initialize Model
     neo4j_model = Neo4jModel(

From f4129323436ebac7385b11520b0caa4e0507e96b Mon Sep 17 00:00:00 2001
From: zihan zhou <95243748+andyasdd1@users.noreply.github.com>
Date: Sat, 30 Nov 2024 23:11:34 -0500
Subject: [PATCH 5/9] Add New Modules: GraphCypherQAChain. Add New Preprocess
 modules: embedding and node retrieval.
Added New Config: OpenAi key retrieve embedding.py graph-preprocess-simpl.py neo4j_import.py openai_key.py job_recommendation_system.py neo4j_model.py recommendation_generator.py resume_processor.py retrieval_engine.py verify.py view.py --- be_repo/modules/job_recommendation_system.py | 72 ++++++++++++-------- be_repo/modules/retrieval_engine.py | 29 ++++---- be_repo/modules/view.py | 8 +-- 3 files changed, 65 insertions(+), 44 deletions(-) diff --git a/be_repo/modules/job_recommendation_system.py b/be_repo/modules/job_recommendation_system.py index 79f84e1d3..95e4b609b 100644 --- a/be_repo/modules/job_recommendation_system.py +++ b/be_repo/modules/job_recommendation_system.py @@ -1,28 +1,31 @@ # job_recommendation_system.py -import logging +from neo4j_model import Neo4jModel +from resume_processor import ResumeProcessor +from retrieval_engine import RetrievalEngine +from recommendation_generator import RecommendationGenerator +from view import CLIView +import sys -from .neo4j_model import Neo4jModel -from .recommendation_generator import RecommendationGenerator -from .resume_processor import ResumeProcessor -from .retrieval_engine import RetrievalEngine -from .view import CLIView +def main(): + + + # Redirect standard output to a file + sys.stdout = open('output.log', 'w') + + # Your code here + print("Lots of output") -def job_recommend(resume_text, user_id): # Setup Logging + import logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) - - # Get Resume Input from User - if not resume_text.strip(): - logger.error(f'No resume text provided, user_id: {user_id}.') - return 'Error: No resume text provided.' - + # Neo4j Connection Details NEO4J_URI = "neo4j+ssc://7bf5a48e.databases.neo4j.io" # Replace with your Neo4j URI - NEO4J_USERNAME = "neo4j" # Replace with your Neo4j username - NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Replace with your Neo4j password + NEO4J_USERNAME = "neo4j" # Replace with your Neo4j username + NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Replace with your Neo4j password # Initialize Model neo4j_model = Neo4jModel( @@ -30,28 +33,43 @@ def job_recommend(resume_text, user_id): username=NEO4J_USERNAME, password=NEO4J_PASSWORD ) - - node_label = "JTitle" # Adjust as needed; could be dynamic based on user input or other criteria - + # Initialize Controller Components resume_processor = ResumeProcessor() retrieval_engine = RetrievalEngine(resume_processor, neo4j_model) recommendation_generator = RecommendationGenerator() - + # Initialize View view = CLIView() - - # Perform Mixed Retrieval + + # Get Resume Input from User + resume_text = view.get_resume_input() + + if not resume_text.strip(): + logger.error("No resume text provided.") + print("Error: No resume text provided.") + return + + # Perform Mixed Retrieval for 'JD' Node Label + node_label = "JD" # Adjust as needed; could be dynamic based on user input or other criteria similar_docs, graph_results = retrieval_engine.perform_mixed_retrieval(resume_text, node_label=node_label) - + if not similar_docs and not graph_results: - return 'No job recommendations found based on your resume.' - + print("No job recommendations found based on your resume.") + return + # Generate Recommendations try: recommendations = recommendation_generator.generate_recommendations(similar_docs, graph_results) except Exception as e: - return 'Error: Failed to generate job recommendations.' 
- + print("Error: Failed to generate job recommendations.") + return + # Display Recommendations - return view.display_recommendations(recommendations) + view.display_recommendations(recommendations) + + # Close the file + sys.stdout.close() + +if __name__ == "__main__": + main() diff --git a/be_repo/modules/retrieval_engine.py b/be_repo/modules/retrieval_engine.py index 0cb62aac2..02fac70e3 100644 --- a/be_repo/modules/retrieval_engine.py +++ b/be_repo/modules/retrieval_engine.py @@ -1,10 +1,12 @@ # retrieval_engine.py +from langchain_neo4j import GraphCypherQAChain +from langchain_openai import ChatOpenAI +from langchain.chains.retrieval import create_retrieval_chain from langchain.chains.combine_documents import create_stuff_documents_chain -from langchain.chains.retrieval import create_retrieval_chain +from configs.openai_key import get_openai_api_key # New import from langchain.prompts import PromptTemplate - class RetrievalEngine: def __init__(self, resume_processor, neo4j_model): """ @@ -19,7 +21,7 @@ def __init__(self, resume_processor, neo4j_model): # Initialize Language Model (already initialized in Neo4jModel) self.llm = self.neo4j_model.llm - + # Initialize GraphCypherQAChain (already initialized in Neo4jModel) self.graph_chain = self.neo4j_model.get_graph_chain() @@ -51,12 +53,12 @@ def __init__(self, resume_processor, neo4j_model): {context} \"\"\" """, - input_variables=["input"] + input_variables=["input"] ) - # Create a documents chain + # Create a documents chain self.combine_docs_chain = create_stuff_documents_chain(self.llm, prompt=prompt) - + # Initialize Retrieval Chain # Default node_label is 'JD'; can be adjusted as needed self.retrieval_chain = create_retrieval_chain( @@ -77,13 +79,13 @@ def perform_mixed_retrieval(self, resume_text, node_label="JD"): """ # Process resume into a Document doc = self.resume_processor.process_resume(resume_text) - + if not doc: return [], {} - + # Store the Document in the appropriate vector store self.neo4j_model.store_documents([doc], node_label=node_label) - + # Access the schema property correctly schema = self.neo4j_model.graph.get_schema @@ -92,15 +94,16 @@ def perform_mixed_retrieval(self, resume_text, node_label="JD"): similar_docs = similar_docs_result.get("output", []) print("similar_docs_result:", similar_docs_result) print("Keys in similar_docs_result:", similar_docs_result.keys()) + + for doc in similar_docs: print("Document Metadata:", doc.metadata) - query = (f"Based on the following resume, recommend relevant job positions based on skills and experience, " - f"while ignoring the location: {resume_text}") + query = f"Based on the following resume, recommend relevant job positions: {resume_text}" graph_response = self.graph_chain.invoke({"query": query, "schema": schema}) # After graph query print("Graph Response:") print(graph_response) - - return similar_docs, graph_response + + return similar_docs, graph_response \ No newline at end of file diff --git a/be_repo/modules/view.py b/be_repo/modules/view.py index f6bd1264b..b7af1eaa1 100644 --- a/be_repo/modules/view.py +++ b/be_repo/modules/view.py @@ -28,8 +28,8 @@ def display_recommendations(self, recommendations): Display job recommendations to the user. """ if not recommendations: - return 'No job recommendations found based on your resume.' - res = '\nRecommended Jobs for You:\n' + print("No job recommendations found based on your resume.") + return + print("\nRecommended Jobs for You:") for idx, job in enumerate(recommendations, start=1): - res += f'{idx}. 
{job}\n'
-    return res
+        print(f"{idx}. {job}")

From 17e28f059c89ea21e55a8e6dad5f9f1061ca2b20 Mon Sep 17 00:00:00 2001
From: Linchen Xu <592789211@qq.com>
Date: Sun, 1 Dec 2024 18:25:52 -0500
Subject: [PATCH 6/9] refactor job_recommendation_system.py imports

---
 be_repo/modules/job_recommendation_system.py | 72 ++++++++------------
 1 file changed, 27 insertions(+), 45 deletions(-)

diff --git a/be_repo/modules/job_recommendation_system.py b/be_repo/modules/job_recommendation_system.py
index 95e4b609b..79f84e1d3 100644
--- a/be_repo/modules/job_recommendation_system.py
+++ b/be_repo/modules/job_recommendation_system.py
@@ -1,31 +1,28 @@
 # job_recommendation_system.py
 
-from neo4j_model import Neo4jModel
-from resume_processor import ResumeProcessor
-from retrieval_engine import RetrievalEngine
-from recommendation_generator import RecommendationGenerator
-from view import CLIView
-import sys
+import logging
 
-def main():
-
-    # Redirect standard output to a file
-    sys.stdout = open('output.log', 'w')
-
-    # Your code here
-    print("Lots of output")
+from .neo4j_model import Neo4jModel
+from .recommendation_generator import RecommendationGenerator
+from .resume_processor import ResumeProcessor
+from .retrieval_engine import RetrievalEngine
+from .view import CLIView
 
 
+def job_recommend(resume_text, user_id):
     # Setup Logging
-    import logging
     logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
     logger = logging.getLogger(__name__)
-
+
+    # Get Resume Input from User
+    if not resume_text.strip():
+        logger.error(f'No resume text provided, user_id: {user_id}.')
+        return 'Error: No resume text provided.'
+
     # Neo4j Connection Details
     NEO4J_URI = "neo4j+ssc://7bf5a48e.databases.neo4j.io"  # Replace with your Neo4j URI
-    NEO4J_USERNAME = "neo4j" # Replace with your Neo4j username
-    NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc" # Replace with your Neo4j password
+    NEO4J_USERNAME = "neo4j"  # Replace with your Neo4j username
+    NEO4J_PASSWORD = "oxsK7V5_86emZlYQlvCfQHfVWS95wXz29OhtU8GAdFc"  # Replace with your Neo4j password
 
     # Initialize Model
     neo4j_model = Neo4jModel(
@@ -33,43 +30,28 @@ def main():
         username=NEO4J_USERNAME,
         password=NEO4J_PASSWORD
     )
-
+
+    node_label = "JTitle"  # Adjust as needed; could be dynamic based on user input or other criteria
+
     # Initialize Controller Components
     resume_processor = ResumeProcessor()
     retrieval_engine = RetrievalEngine(resume_processor, neo4j_model)
     recommendation_generator = RecommendationGenerator()
-
+
     # Initialize View
     view = CLIView()
-
-    # Get Resume Input from User
-    resume_text = view.get_resume_input()
-
-    if not resume_text.strip():
-        logger.error("No resume text provided.")
-        print("Error: No resume text provided.")
-        return
-
-    # Perform Mixed Retrieval for 'JD' Node Label
-    node_label = "JD"  # Adjust as needed; could be dynamic based on user input or other criteria
+
+    # Perform Mixed Retrieval
     similar_docs, graph_results = retrieval_engine.perform_mixed_retrieval(resume_text, node_label=node_label)
-
+
     if not similar_docs and not graph_results:
-        print("No job recommendations found based on your resume.")
-        return
-
+        return 'No job recommendations found based on your resume.'
+ # Generate Recommendations try: recommendations = recommendation_generator.generate_recommendations(similar_docs, graph_results) except Exception as e: - print("Error: Failed to generate job recommendations.") - return - - # Display Recommendations - view.display_recommendations(recommendations) + return 'Error: Failed to generate job recommendations.' - # Close the file - sys.stdout.close() - -if __name__ == "__main__": - main() + # Display Recommendations + return view.display_recommendations(recommendations) From 82de40694b04ae3054069eeef59d6f28ce0d92d7 Mon Sep 17 00:00:00 2001 From: Linchen Xu <592789211@qq.com> Date: Sun, 1 Dec 2024 18:32:22 -0500 Subject: [PATCH 7/9] refact job_recommendation_system.py import --- be_repo/modules/view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/be_repo/modules/view.py b/be_repo/modules/view.py index b7af1eaa1..595a8ebfb 100644 --- a/be_repo/modules/view.py +++ b/be_repo/modules/view.py @@ -30,6 +30,7 @@ def display_recommendations(self, recommendations): if not recommendations: print("No job recommendations found based on your resume.") return - print("\nRecommended Jobs for You:") + res = "\nRecommended Jobs for You:\n" for idx, job in enumerate(recommendations, start=1): - print(f"{idx}. {job}") + res += f"{idx}. {job}\n" + return res From d8c2a151e2b959dade2b36502b73d3f1449bba56 Mon Sep 17 00:00:00 2001 From: Linchen Xu <592789211@qq.com> Date: Sun, 1 Dec 2024 18:33:56 -0500 Subject: [PATCH 8/9] refact: return recommended result in api --- be_repo/modules/view.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be_repo/modules/view.py b/be_repo/modules/view.py index 595a8ebfb..d427efa2c 100644 --- a/be_repo/modules/view.py +++ b/be_repo/modules/view.py @@ -28,8 +28,7 @@ def display_recommendations(self, recommendations): Display job recommendations to the user. """ if not recommendations: - print("No job recommendations found based on your resume.") - return + return "No job recommendations found based on your resume." res = "\nRecommended Jobs for You:\n" for idx, job in enumerate(recommendations, start=1): res += f"{idx}. {job}\n" From 0d604fdf7ce9be776c4c963c513f104d441b5df0 Mon Sep 17 00:00:00 2001 From: zihan zhou <95243748+andyasdd1@users.noreply.github.com> Date: Sun, 1 Dec 2024 17:20:52 -0500 Subject: [PATCH 9/9] Update: Add in response from cypherqagraph. Fix: Fix issue where job title doesn't exists, allow node search switch --- be_repo/modules/retrieval_engine.py | 46 +++++++++++++---------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/be_repo/modules/retrieval_engine.py b/be_repo/modules/retrieval_engine.py index 02fac70e3..83e2b6a83 100644 --- a/be_repo/modules/retrieval_engine.py +++ b/be_repo/modules/retrieval_engine.py @@ -26,13 +26,10 @@ def __init__(self, resume_processor, neo4j_model): self.graph_chain = self.neo4j_model.get_graph_chain() # Define the PromptTemplate with 'context' as input variable - prompt = PromptTemplate( - template=""" - You are an expert Cypher query writer for a Neo4j graph database. + template=""" + You are an assistant that matches resumes to relevant job descriptions. - Given the user's question, generate an efficient Cypher query that: - - extract entities and relationships from the following resume. - - Focus solely on the resume content. + Given the user's resume, find the most relevant job descriptions. 
**Entities to Extract:** - **Education (Edu):** Details about degrees, fields of study, institutions, start and end years, GPA. @@ -42,30 +39,20 @@ def __init__(self, resume_processor, neo4j_model): - **Certifications (Cert):** Certification names, issuing organizations, expiration dates. - **Soft Skills (SSkill):** Non-technical skills like leadership, communication. - **Relationships to Identify:** - - **UTILIZES_SKILL:** A Work Experience (WE) node utilizes a Skill (Skill) node. - - **USES_TECH:** A Project (Proj) node uses a Skill (Skill) node as a technology. - - **REL_TO (Proj to Skill):** A Project (Proj) node is related to a Skill (Skill) node. - - **REL_TO (Skill to Skill):** A Skill (Skill) node is similar to another Skill (Skill) node. - **Resume:** \"\"\" {context} \"\"\" - """, - input_variables=["input"] - ) - - # Create a documents chain - self.combine_docs_chain = create_stuff_documents_chain(self.llm, prompt=prompt) + """ - # Initialize Retrieval Chain - # Default node_label is 'JD'; can be adjusted as needed - self.retrieval_chain = create_retrieval_chain( - self.neo4j_model.get_retriever(node_label="JD"), - self.combine_docs_chain + self.prompt_template = PromptTemplate( + template=template, + input_variables=["input"] ) + # Create a documents chain + self.combine_docs_chain = create_stuff_documents_chain(self.llm, self.prompt_template) + def perform_mixed_retrieval(self, resume_text, node_label="JD"): """ Perform mixed retrieval using vector similarity and graph traversal. @@ -89,14 +76,21 @@ def perform_mixed_retrieval(self, resume_text, node_label="JD"): # Access the schema property correctly schema = self.neo4j_model.graph.get_schema + # Get the retriever for the given node label + retriever = self.neo4j_model.get_retriever(node_label=node_label) + + # Create the retrieval chain with the retriever and the combine_docs_chain + retrieval_chain = create_retrieval_chain( + retriever, + self.combine_docs_chain + ) + # Perform vector similarity search - similar_docs_result = self.retrieval_chain.invoke({"input": resume_text}) # Corrected to 'context' + similar_docs_result = retrieval_chain.invoke({"input": resume_text}) # Corrected to 'context' similar_docs = similar_docs_result.get("output", []) print("similar_docs_result:", similar_docs_result) print("Keys in similar_docs_result:", similar_docs_result.keys()) - - for doc in similar_docs: print("Document Metadata:", doc.metadata)
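
Notes on this series. First, the retrieval wiring added in patches 3 and 9: LangChain's create_retrieval_chain conventionally returns a dict keyed by "input", "context", and "answer", so the similar_docs_result.get("output", []) read above would typically come back empty; pulling the retrieved documents from "context" is the likelier intent. A minimal sketch of the pattern, assuming current langchain/langchain-openai APIs (the model name and the retriever are placeholders, not the project's real objects):

    # Sketch only: assumes langchain's create_retrieval_chain and
    # create_stuff_documents_chain; "retriever" is any LangChain retriever.
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain.chains.retrieval import create_retrieval_chain
    from langchain.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model="gpt-4o-mini")  # placeholder model name
    prompt = PromptTemplate(
        input_variables=["context"],  # the stuff chain fills {context} with documents
        template="Match this resume to relevant job descriptions:\n{context}",
    )
    combine_docs_chain = create_stuff_documents_chain(llm, prompt)

    def run_vector_retrieval(retriever, resume_text):
        chain = create_retrieval_chain(retriever, combine_docs_chain)
        result = chain.invoke({"input": resume_text})
        # Retrieved Documents live under "context"; the LLM text under "answer".
        return result.get("context", []), result.get("answer", "")

Note also that the PromptTemplate in the patches declares input_variables=["input"] while its template body uses {context}; the stuff-documents chain expects the {context} variable.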
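
Second, merge_results reads graph_results['intermediate_steps'][1]['context'], which is only populated when the GraphCypherQAChain is built with return_intermediate_steps=True. A sketch of that invocation, assuming the langchain-neo4j API imported in neo4j_model.py and placeholder credentials:

    # Sketch only: placeholder URI and credentials; assumes langchain-neo4j's
    # GraphCypherQAChain as used elsewhere in this series.
    from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
    from langchain_openai import ChatOpenAI

    graph = Neo4jGraph(
        url="neo4j+s://example.databases.neo4j.io",  # placeholder
        username="neo4j",
        password="<password>",
    )
    chain = GraphCypherQAChain.from_llm(
        llm=ChatOpenAI(temperature=0),
        graph=graph,
        return_intermediate_steps=True,  # exposes the generated Cypher and raw rows
        allow_dangerous_requests=True,   # explicit opt-in required by recent versions
    )
    response = chain.invoke({"query": "Recommend jobs matching Python and Neo4j skills"})
    answer = response["result"]  # natural-language answer, merged in by merge_results
    steps = response.get("intermediate_steps", [])
    rows = steps[1]["context"] if len(steps) > 1 else []  # raw query results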
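
Third, the ranking in recommendation_generator.py is a plain frequency merge: each job description seen in either retrieval source scores one point per occurrence, and the list comes back in descending score order. The same idea in a self-contained form (the sample job strings are made up):

    # Sketch of the frequency-based merge behind merge_results.
    from collections import Counter

    def merge_by_frequency(vector_jobs, graph_jobs):
        counts = Counter()
        for description in list(vector_jobs) + list(graph_jobs):
            if description:  # skip empty descriptions, as the patch does
                counts[description] += 1
        # Highest combined count first, mirroring the sorted() call in the patch
        return [description for description, _ in counts.most_common()]

    print(merge_by_frequency(
        ["Data Engineer, Acme", "ML Engineer, Beta"],
        ["ML Engineer, Beta"],
    ))  # -> ['ML Engineer, Beta', 'Data Engineer, Acme']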
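
Fourth, the tail of neo4j_import.py in patch 2 filters out rows with Unknown node types and then hands import_relationships_in_batches to session.execute_write. The function body is not shown in this series; one plausible shape uses UNWIND over parameter batches (the REL_TO type and id-based MATCH are assumptions, not the project's confirmed schema):

    # Hypothetical sketch of a batched relationship import compatible with
    # session.execute_write(import_relationships_in_batches, relationship_df).
    def import_relationships_in_batches(tx, relationship_df, batch_size=500):
        rows = relationship_df.to_dict("records")
        for i in range(0, len(rows), batch_size):
            tx.run(
                """
                UNWIND $rows AS row
                MATCH (a {id: row.start_id})
                MATCH (b {id: row.end_id})
                MERGE (a)-[:REL_TO]->(b)
                """,
                rows=rows[i:i + batch_size],
            )

Because execute_write invokes the function once, all batches here run inside a single managed transaction; chunking the UNWIND payload mainly bounds the size of each individual query.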
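
Finally, several patches hardcode the Neo4j password beside a "Replace with your Neo4j password" comment, while the OpenAI key is already fetched from MongoDB via get_openai_api_key(). A hedged sketch of sourcing the Neo4j settings from the environment instead (the NEO4J_* variable names are assumptions, not an existing project convention):

    # Sketch: read connection details from the environment rather than the source.
    import os

    NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
    NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]  # fail fast when unset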