
Commit 4df0e51

Merge branch 'develop'
2 parents ecea861 + 47a9c96 commit 4df0e51

37 files changed: +2538 -721 lines changed

Project/backend/codebase/graph_analysis/graph_analysis.py

Lines changed: 26 additions & 91 deletions
@@ -2,6 +2,18 @@
 import os
 import json
 
+def get_top_n_central_nodes(centrality_dict, n):
+    """Sort nodes based on centrality measure and return top N nodes.
+
+    Args:
+        centrality_dict: Dictionary of nodes with their centrality values.
+        n: Number of top nodes to return.
+
+    Returns:
+        Sorted list of top N nodes with their centrality values.
+    """
+    sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
+    return sorted_nodes[:n]
 
 def analyze_graph_structure(G):
     """Analyzes the structure of a knowledge graph and provides hopefully useful information.
@@ -17,15 +29,12 @@ def analyze_graph_structure(G):
     # Basic Graph Statistics
     num_nodes = G.number_of_nodes()  # Total number of nodes
     num_edges = G.number_of_edges()  # Total number of edges
-    density = nx.density(G)  # Ratio of actual edges to possible edges (0 to 1)
-    average_degree = 2 * num_edges / num_nodes  # Average number of edges per node
 
-    # Degree Distribution
-    degree_distribution = dict(G.degree())
-    # Degree distribution can indicate the presence of hubs or important nodes
+    if num_nodes == 0 or num_edges == 0:
+        raise ValueError("The graph is empty or not properly constructed.")
 
+    # Degree Centrality: Measures node connectivity
     degree_centrality = nx.degree_centrality(G)
-
     """ Centrality Measures
     - Degree Centrality: Measures node connectivity
     - Nodes with high degree centrality are important in the network
@@ -36,8 +45,9 @@
     - Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
     """
 
-    betweenness_centrality = nx.betweenness_centrality(G)
 
+    # Betweenness Centrality: Measures node's control over information flow
+    betweenness_centrality = nx.betweenness_centrality(G)
     """
     - Betweenness Centrality: Measures node's control over information flow
     - Nodes with high betweenness centrality are important in the network
@@ -54,24 +64,8 @@
     - Betweenness Centrality show the dependency of the network on a node
 
     """
-
-    # - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
-    closeness_centrality = nx.closeness_centrality(G)
-
-    """
-    - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
-    - Nodes with high closeness centrality are important in the network
-
-    Examples: 4 nodes are connected
-            0
-          / | \
-         2--1--3
-
-    - Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
-    - Closeness Centrality show the average distance of a node to all other nodes in the network
-    """
-
-    # - Eigenvector Centrality: Measures influence of a node in a network
+
+    # eigenvector centrality measures the influence of a node in a network
     eigenvector_centrality = nx.eigenvector_centrality(G)
 
     """
@@ -92,78 +86,16 @@
 
     """
 
-    # Community Structure
-    # - Louvain Algorithm (for community detection)
-    communities = list(nx.community.greedy_modularity_communities(G))
-    community_sizes = [len(community) for community in communities]
-    num_communities = len(communities)
-    # Communities can reveal modular structures in the graph
-    """
-    - Community Detection: Identifying groups of nodes that are more connected to each other than to the rest of the network
-    - Communities can reveal modular structures in the graph
-    - Communities can be used to identify groups of nodes that are more connected to each other than to the rest of the network
-
-    Examples: 7 nodes are connected
-         1
-        / \
-       2-----3
-        \   /    5
-         4-----/ \
-            6-----7
-
-    - Here, nodes 1, 2, 3, 4 are in one community and nodes 5, 6, 7 are in another community
-    """
-
-    # Graph Connectivity
-    # - Check if the graph is connected
-    is_connected = nx.is_connected(G)
-    # - Calculate diameter: Longest shortest path between any two nodes
-    diameter = nx.diameter(G) if is_connected else float('inf')
-    # - Average shortest path length: Average of all shortest paths in the graph
-    average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')
-
-    # Clustering Coefficient
-    # - Measures the degree to which nodes tend to cluster together
-    average_clustering_coefficient = nx.average_clustering(G)
-
-    # Assortativity
-    # - Measures the similarity of connections in the graph with respect to node degree
-    assortativity = nx.degree_assortativity_coefficient(G)
-
-    # Graph Diameter and Radius
-    # - Diameter: Longest shortest path in the graph
-    # - Radius: Minimum eccentricity of any node
-    radius = nx.radius(G) if is_connected else float('inf')
-
-    # Graph Transitivity
-    # - Measures the overall probability for the network to have adjacent nodes interconnected
-    transitivity = nx.transitivity(G)
-
-    # Return a dictionary containing the structural information
     graph_info = {
         "num_nodes": num_nodes,
         "num_edges": num_edges,
-        "density": density,
-        "average_degree": average_degree,
-        "degree_distribution": degree_distribution,
-        "degree_centrality": degree_centrality,
-        "betweenness_centrality": betweenness_centrality,
-        "closeness_centrality": closeness_centrality,
-        "eigenvector_centrality": eigenvector_centrality,
-        "num_communities": num_communities,
-        "community_sizes": community_sizes,
-        "is_connected": is_connected,
-        "diameter": diameter,
-        "average_shortest_path_length": average_shortest_path_length,
-        "average_clustering_coefficient": average_clustering_coefficient,
-        "assortativity": assortativity,
-        "radius": radius,
-        "transitivity": transitivity
+        "top_degree_centrality": get_top_n_central_nodes(degree_centrality, top_n),
+        "top_betweenness_centrality": get_top_n_central_nodes(betweenness_centrality, top_n),
+        "top_eigenvector_centrality": get_top_n_central_nodes(eigenvector_centrality, top_n)
     }
 
     return graph_info
 
-
 def print_graph_info(graph_info):
     """Prints the graph information in a formatted and readable way.
 
@@ -176,9 +108,12 @@ def print_graph_info(graph_info):
 
 graph_directory = os.fsencode("../.media/graphs/")
 
+
+top_n = int(input("Enter the number of top nodes to display: "))
+
 with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
     for entry in it:
-        if entry.name.endswith(".gml") and entry.is_file():
+        if entry.name.endswith("c.gml") and entry.is_file():
             print("-----------------------")
             print(f"Filename: {entry.name}")
             graph = nx.read_gml(entry.path)
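For orientation, here is a minimal sketch of how the reworked output of analyze_graph_structure could be exercised: the diff above replaces the full per-node dictionaries with only the top-N nodes per centrality measure via get_top_n_central_nodes. The toy graph and the top_n value below are hypothetical (the script reads top_n from input()); the nx.*_centrality calls are the ones used in the file, and the helper is restated inline so the sketch is self-contained.

import networkx as nx

# Stand-in restatement of get_top_n_central_nodes from graph_analysis.py above.
def get_top_n_central_nodes(centrality_dict, n):
    return sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)[:n]

G = nx.karate_club_graph()  # hypothetical example graph
top_n = 3  # assumed value

for name, centrality in [
    ("top_degree_centrality", nx.degree_centrality(G)),
    ("top_betweenness_centrality", nx.betweenness_centrality(G)),
    ("top_eigenvector_centrality", nx.eigenvector_centrality(G)),
]:
    # Each entry is a descending list of (node, centrality) pairs.
    print(name, get_top_n_central_nodes(centrality, top_n))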

Project/backend/codebase/graph_creator/gemini.py

Lines changed: 25 additions & 18 deletions
@@ -1,6 +1,8 @@
 import os
 from datetime import datetime
+
 import google.generativeai as genai
+
 from graph_creator.services.json_handler import transform_llm_output_to_dict
 
 
@@ -36,27 +38,29 @@ def extract_entities_and_relations(chunk, genai_client):
     """
     SYS_PROMPT = (
         "Only answer in a JSON format. \n"
-        "You are a network graph maker who extracts terms and their relations from a given context. "
-        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
-        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
-        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
-        "\tTerms may include object, entity, location, organization, person, \n"
-        "\tcondition, acronym, documents, service, concept, etc.\n"
-        "\tTerms should be as atomistic as possible\n\n"
-        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
-        "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
-        "\tTerms can be related to many other terms\n\n"
-        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
-        "Format your output as a list of JSON. Each element of the list contains a pair of terms"
+        "You are a network graph maker who extracts the most critical terms and their relations from a given context. "
+        "You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
+        "of the few key terms that are indispensable for understanding the given context. These terms should represent the core concepts as per the context. \n"
+        "Thought 1: Identify the few most critical terms in the entire context.\n"
+        "\tTerms may include only the most significant objects, entities, locations, organizations, people, conditions, acronyms, documents, services, concepts, etc.\n"
+        "\tExclude all terms that are not crucial to the core message.\n"
+        "\tDo not extract a term from every sentence; focus only on the most important terms across the entire context.\n\n"
+        "Thought 2: Determine how these indispensable terms are directly related to each other.\n"
+        "\tTerms that are mentioned in the same sentence or paragraph are typically related to each other.\n"
+        "\tFocus solely on relationships that reveal the most critical interactions or dependencies, ignoring all minor details.\n\n"
+        "Thought 3: Identify the specific type of relationship between each related pair of terms.\n"
+        "\tEnsure the relationship is crucial, highly relevant, and necessary for understanding the context.\n\n"
+        "Format your output as a list of JSON. Each element of the list contains a pair of terms "
         "and the relation between them, like the following: \n"
         "[\n"
         "   {\n"
-        '       "node_1": "A concept from extracted ontology",\n'
-        '       "node_2": "A related concept from extracted ontology",\n'
+        '       "node_1": "A core concept from extracted ontology",\n'
+        '       "node_2": "A related core concept from extracted ontology",\n'
         '       "edge": "relationship between the two concepts, node_1 and node_2"\n'
         "   }, {...}\n"
         "]"
     )
+
     USER_PROMPT = f"context: ```{chunk}``` \n\n output: "
 
     chat_session = genai_client.start_chat(history=[])
@@ -74,9 +78,12 @@ def check_for_connecting_relation(
     """
     SYS_PROMPT = (
         "Only answer in JSON format. \n"
-        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
-        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
-        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
+        "Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
+        "with any entity of list_2.\n "
+        "We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
+        "delimited by ```). "
+        "For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
+        "list_2:\n "
         f"list_1: {entities_component_1}\n"
         f"list_2: {entities_component_2}\n"
         "Only use the exact entities given in the lists."
@@ -110,7 +117,7 @@ def check_for_connecting_relation_(
         The text chunk to be proccessed
     entities_component_1 : list
         List of entities
-    entities_component_1 : list
+    entities_component_2 : list
         List of entities
 
     Returns
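For context on the prompt change: the revised system prompt above asks the model for a JSON list of objects with "node_1", "node_2", and "edge" keys. Below is a minimal sketch of a conforming response and how it could be parsed; the sample terms are invented, and in the project the raw output is passed through transform_llm_output_to_dict rather than handled directly like this.

import json

# Hypothetical model response in the requested format.
llm_output = """
[
    {
        "node_1": "knowledge graph",
        "node_2": "document chunk",
        "edge": "is extracted from"
    }
]
"""

for relation in json.loads(llm_output):
    print(f'{relation["node_1"]} --[{relation["edge"]}]--> {relation["node_2"]}')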

Project/backend/codebase/graph_creator/graph_creator_main.py

Lines changed: 9 additions & 7 deletions
@@ -1,12 +1,15 @@
+import logging
 import mimetypes
-import pandas
 
+from graph_creator import graph_handler
+from graph_creator import pdf_handler
 from graph_creator.llama3 import process_chunks as groq_process_chunks
 from graph_creator.models.graph_job import GraphJob
-from graph_creator import pdf_handler
-from graph_creator import graph_handler
 from graph_creator.services import netx_graphdb
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 
 
 def process_file_to_graph(g_job: GraphJob):
@@ -58,11 +61,9 @@ def process_file_to_entities_and_relations(file: str):
         ]  # Assuming chunk has 'page_content' attribute
 
         # Generate response using LLM
-        # response_json = process_chunks(text_chunks, prompt_template)
         response_json = groq_process_chunks(text_chunks)
-        print(response_json)
     except Exception as e:
-        print(e)
+        logging.error(e)
         response_json = None
 
     return response_json, chunks
@@ -83,6 +84,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):
 
     # combine knowledge graph pieces
     # combined = graph_handler.connect_with_chunk_proximity(df_e_and_r)
+    # combined['chunk_id'] = '1'
     for i in range(len(chunks)):
         chunks[i] = chunks[i].dict()
     combined = graph_handler.connect_with_llm(df_e_and_r, chunks, 30)
@@ -91,7 +93,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):
     graph_db_service = netx_graphdb.NetXGraphDB()
 
     # read entities and relations
-    graph = graph_db_service.create_graph_from_df(combined)
+    graph = graph_db_service.create_graph_from_df(combined, chunks)
 
     # save graph as file
     graph_db_service.save_graph(uuid, graph)
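As a note on the print-to-logging switch above, here is a minimal self-contained sketch of the error-handling pattern the module now follows. The function name, the failing stand-in call, and the sample chunk are hypothetical; in the real module the call is groq_process_chunks from graph_creator.llama3 and the logging setup matches the diff.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def groq_process_chunks(text_chunks):
    # Stand-in for graph_creator.llama3.process_chunks; fails on purpose
    # so the logging path below is visible.
    raise RuntimeError("LLM backend unavailable")


def generate_relations(text_chunks):
    # Mirrors process_file_to_entities_and_relations: errors are logged
    # rather than printed, and the caller receives None as a fallback.
    try:
        response_json = groq_process_chunks(text_chunks)
    except Exception as e:
        logging.error(e)
        response_json = None
    return response_json


print(generate_relations(["example chunk"]))  # logs the error, prints None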
