Skip to content

Commit a3227d4

Browse files
Merge branch 'develop'
Signed-off-by: Nikolas Rauscher <nikolas.rauscher@gmail.com>
2 parents 8979f38 + 75c3bdc commit a3227d4

34 files changed

+1489
-935
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,5 @@ bin/
3434

3535
# MacOS
3636
.DS_Store
37+
# embeddings
38+
Project/backend/codebase/embeddings/*
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Measurements of running Graph creator
2+
# india health article
3+
4+
llama3, chunk_size=1500, chunk_overlap=150, 30 calls/min, 60 sec wait
5+
6+
gemini-pro, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait
7+
8+
gemini-flash, chunk_size=1500, chunk_overlap=150, - calls/min, - sec wait
9+
10+
# Measurements as averages over all LLM calls
11+
# two kinds of prompts used: extract_entities_and_relations and check_for_connecting_relation
12+
13+
14+
# Execution speed of prompts by llm model
15+
16+
gemini-flash: 12,75s (10 extraction requests) per request extracting, 1.57s connecting
17+
18+
gemini: 23,54s (10 extraction requests) per request extracting, 2,37s connecting
19+
20+
groq+llama3: 0.72s (10 extraction requests) per request extracting, 0,48s connecting
21+
22+
---------------------------------------
23+
24+
# Statistics on the number of extracted entities by llm model
25+
26+
llama: 3078 tokens / 1770 words -> 177 / 180 entities (34 / 47 connecting requests)
27+
28+
gemini: 3078 tokens / 1770 words -> 303 / 316 entities (35 connecting requests)
29+
30+
gemini-flash: 3078 tokens / 1770 words -> 309 / 369 entities (28 connecting requests)
31+
32+
33+
-----------------------------------------
34+
35+
# Duration of knowledge graph extraction by llm model
36+
37+
gemini-flash: 127,5s for entity extraction and 105s for connecting
38+
39+
gemini: 212s for entity extraction and 189s for connecting
40+
41+
groq+llama3: 7,9s for entity extraction and 136s for connecting

Project/backend/.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ DEBUG=True
88
ALLOWED_HOSTS=*
99
CSRF_TRUSTED_ORIGINS='*'
1010
CORS_ALLOWED_ORIGINS='*'
11+
CHUNK_SIZE=1500
1112

1213
# Database
1314
POSTGRES_USER=amos

Project/backend/codebase/graph_analysis/graph_analysis.py

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@ def get_top_n_central_nodes(centrality_dict, n):
1212
Returns:
1313
Sorted list of top N nodes with their centrality values.
1414
"""
15+
# sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
16+
# return sorted_nodes[:n]
1517
sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
16-
return sorted_nodes[:n]
18+
return [node for node, _ in sorted_nodes[:n]]
1719

1820
def analyze_graph_structure(G):
1921
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
@@ -30,6 +32,10 @@ def analyze_graph_structure(G):
3032
num_nodes = G.number_of_nodes() # Total number of nodes
3133
num_edges = G.number_of_edges() # Total number of edges
3234

35+
# Degree Distribution
36+
degree_distribution = dict(G.degree())
37+
# Degree distribution can indicate the presence of hubs or important nodes
38+
3339
if num_nodes == 0 or num_edges == 0:
3440
raise ValueError("The graph is empty or not properly constructed.")
3541

@@ -48,7 +54,7 @@ def analyze_graph_structure(G):
4854

4955
# Betweenness Centrality: Measures node's control over information flow
5056
betweenness_centrality = nx.betweenness_centrality(G)
51-
"""
57+
"""
5258
- Betweenness Centrality: Measures node's control over information flow
5359
- Nodes with high betweenness centrality are important in the network
5460
@@ -86,36 +92,31 @@ def analyze_graph_structure(G):
8692
8793
"""
8894

89-
graph_info = {
90-
"num_nodes": num_nodes,
91-
"num_edges": num_edges,
92-
"top_degree_centrality": get_top_n_central_nodes(degree_centrality, top_n),
93-
"top_betweenness_centrality": get_top_n_central_nodes(betweenness_centrality, top_n),
94-
"top_eigenvector_centrality": get_top_n_central_nodes(eigenvector_centrality, top_n)
95-
}
96-
97-
return graph_info
98-
99-
def print_graph_info(graph_info):
100-
"""Prints the graph information in a formatted and readable way.
95+
# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
96+
closeness_centrality = nx.closeness_centrality(G)
10197

102-
Args:
103-
graph_info: A dictionary containing information about the graph's structure.
10498
"""
99+
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
100+
- Nodes with high closeness centrality are important in the network
105101
106-
print(json.dumps(graph_info, indent=4))
107-
102+
Examples: 4 nodes are connected
103+
0
104+
/ | \
105+
2--1--3
108106
109-
graph_directory = os.fsencode("../.media/graphs/")
107+
- Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
108+
- Closeness Centrality show the average distance of a node to all other nodes in the network
109+
"""
110+
n = 20 # Number of top nodes to return
111+
# Calculate centrality measures
112+
degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
113+
betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
114+
eigenvector_centrality = get_top_n_central_nodes(nx.eigenvector_centrality(G), n)
115+
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)
110116

117+
# Find intersection of top nodes from all measures (set intersection)
118+
all_centrality_nodes = set(degree_centrality) & set(betweenness_centrality) & set(eigenvector_centrality) & set(closeness_centrality)
111119

112-
top_n = int(input("Enter the number of top nodes to display: "))
120+
top_nodes = list(all_centrality_nodes)[:6]
113121

114-
with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
115-
for entry in it:
116-
if entry.name.endswith("c.gml") and entry.is_file():
117-
print("-----------------------")
118-
print(f"Filename: {entry.name}")
119-
graph = nx.read_gml(entry.path)
120-
graph_info = analyze_graph_structure(graph)
121-
print_graph_info(graph_info)
122+
return top_nodes

0 commit comments

Comments
 (0)