Skip to content

Commit 1a6215d

Browse files
authored
Merge pull request #196 from amosproj/develop
Develop
2 parents a62e4fb + 66f1520 commit 1a6215d

33 files changed

+2027
-633
lines changed

Project/backend/codebase/graph_analysis/graph_analysis.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
import networkx as nx
2-
import os
3-
import json
2+
43

54
def get_top_n_central_nodes(centrality_dict, n):
65
"""Sort nodes based on centrality measure and return top N nodes.
7-
6+
87
Args:
98
centrality_dict: Dictionary of nodes with their centrality values.
109
n: Number of top nodes to return.
11-
10+
1211
Returns:
1312
Sorted list of top N nodes with their centrality values.
1413
"""
1514
# sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
1615
# return sorted_nodes[:n]
17-
sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
16+
sorted_nodes = sorted(
17+
centrality_dict.items(), key=lambda item: item[1], reverse=True
18+
)
1819
return [node for node, _ in sorted_nodes[:n]]
1920

21+
2022
def analyze_graph_structure(G):
2123
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
2224
Currently, I am not sure how to use most of the information, but we may find a way to use it
@@ -32,7 +34,7 @@ def analyze_graph_structure(G):
3234
num_nodes = G.number_of_nodes() # Total number of nodes
3335
num_edges = G.number_of_edges() # Total number of edges
3436

35-
# Degree Distribution
37+
# Degree Distribution
3638
degree_distribution = dict(G.degree())
3739
# Degree distribution can indicate the presence of hubs or important nodes
3840

@@ -51,7 +53,6 @@ def analyze_graph_structure(G):
5153
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
5254
"""
5355

54-
5556
# Betweenness Centrality: Measures node's control over information flow
5657
betweenness_centrality = nx.betweenness_centrality(G)
5758
"""
@@ -70,7 +71,7 @@ def analyze_graph_structure(G):
7071
- Betweenness Centrality shows the dependency of the network on a node
7172
7273
"""
73-
74+
7475
# eigenvector centrality measures the influence of a node in a network
7576
eigenvector_centrality = nx.eigenvector_centrality(G)
7677

@@ -115,7 +116,12 @@ def analyze_graph_structure(G):
115116
closeness_centrality = get_top_n_central_nodes(nx.closeness_centrality(G), n)
116117

117118
# Find intersection of top nodes from all measures (set intersection)
118-
all_centrality_nodes = set(degree_centrality) & set(betweenness_centrality) & set(eigenvector_centrality) & set(closeness_centrality)
119+
all_centrality_nodes = (
120+
set(degree_centrality)
121+
& set(betweenness_centrality)
122+
& set(eigenvector_centrality)
123+
& set(closeness_centrality)
124+
)
119125

120126
top_nodes = list(all_centrality_nodes)[:6]
121127

Project/backend/codebase/graph_creator/graph_handler.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import json
22
import logging
33
import re
4-
import time
54

65
# from graph_creator import llama3
76
# from graph_creator import embedding_handler # To be integrated
@@ -345,18 +344,34 @@ def add_relations_to_data(entity_and_relation_df, new_relations):
345344
return entity_and_relation_df
346345

347346

348-
def add_topic(data: pd.DataFrame) -> pd.DataFrame:
349-
documents = list(set(data['node_1']).union(set(data['node_2'])))
347+
def add_topic(data: pd.DataFrame, max_topics: int = 25) -> pd.DataFrame:
348+
documents = list(set(data["node_1"]).union(set(data["node_2"])))
350349

351350
topic_model = BERTopic()
352351
topics, probabilities = topic_model.fit_transform(documents)
353-
topic_name_info = {row['Topic']: row['Name'] for _, row in topic_model.get_topic_info().iterrows()}
354-
doc_topic_map = {doc: topic for doc, topic in zip(documents, topics)}
355-
doc_topic_strings_map = {doc: topic_name_info.get(topic, "no_topic") for doc, topic in doc_topic_map.items()}
352+
topic_info = topic_model.get_topic_info()
353+
354+
# Keep only the top given number of topics
355+
top_topics = topic_model.get_topic_info().head(max_topics)["Topic"].tolist()
356+
357+
topic_name_info = {
358+
row["Topic"]: row["Name"] for _, row in topic_info.iterrows()
359+
}
360+
361+
# Create a mapping for "other" topics
362+
doc_topic_map = {doc: (topic if topic in top_topics else "other") for doc, topic in zip(documents, topics)}
363+
doc_topic_strings_map = {
364+
doc: (topic_name_info.get(topic, "other") if topic != "other" else "other")
365+
for doc, topic in doc_topic_map.items()
366+
}
356367

357368
# Add new columns to the DataFrame and populate them
358-
data['topic_node_1'] = [doc_topic_strings_map[node] for i, node in data['node_1'].items()]
359-
data['topic_node_2'] = [doc_topic_strings_map[node] for i, node in data['node_2'].items()]
369+
data["topic_node_1"] = [
370+
doc_topic_strings_map[node] for i, node in data["node_1"].items()
371+
]
372+
data["topic_node_2"] = [
373+
doc_topic_strings_map[node] for i, node in data["node_2"].items()
374+
]
360375
return data
361376

362377

Project/backend/codebase/graph_creator/pdf_handler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ def process_pdf_into_chunks(filename):
3131
raise ValueError("Failed to load PDF documents.")
3232

3333
# splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
34-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=os.getenv("CHUNK_SIZE", 1500), chunk_overlap=150)
34+
text_splitter = RecursiveCharacterTextSplitter(
35+
chunk_size=os.getenv("CHUNK_SIZE", 1500), chunk_overlap=150
36+
)
3537
splits = text_splitter.split_documents(docs)
3638

3739
return splits

Project/backend/codebase/graph_creator/router.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,4 +297,4 @@ async def query_graph(
297297
)
298298
graph = netx_services.load_graph(graph_job_id=graph_job_id)
299299
graph_keywords = analyze_graph_structure(graph)
300-
return graph_keywords
300+
return graph_keywords

Project/backend/codebase/graph_creator/services/file_handler.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
from langchain_community.document_loaders import Docx2txtLoader
88
from langchain_community.document_loaders import UnstructuredPowerPointLoader
99

10-
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
10+
from langchain_text_splitters import (
11+
RecursiveCharacterTextSplitter,
12+
RecursiveJsonSplitter,
13+
)
1114

1215

1316
class FileHandler:
@@ -40,7 +43,9 @@ def _process_doc_to_chunks(docs):
4043
raise ValueError("Failed to load documents.")
4144

4245
# splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
43-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=os.getenv("CHUNK_SIZE", 1500), chunk_overlap=150)
46+
text_splitter = RecursiveCharacterTextSplitter(
47+
chunk_size=os.getenv("CHUNK_SIZE", 1500), chunk_overlap=150
48+
)
4449
splits = text_splitter.split_documents(docs)
4550
return splits
4651

@@ -56,4 +61,3 @@ def _get_json_chunks(self):
5661
splitter = RecursiveJsonSplitter(max_chunk_size=os.getenv("CHUNK_SIZE", 1500))
5762
json_chunks = splitter.create_documents(texts=[json_data])
5863
return json_chunks
59-

Project/backend/codebase/graph_creator/services/llm/gemini.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def extract_entities_and_relations(self, chunk):
109109
" }, {...}\n"
110110
"]"
111111
)
112-
112+
113113
USER_PROMPT = f"context: ```{chunk}``` \n\n output: "
114114

115115
chat_session = self.genai_client.start_chat(history=[])

Project/backend/codebase/graph_creator/services/netx_graphdb.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,18 @@ def create_graph_from_df(self, data: pd.DataFrame, chunks: dict) -> nx.Graph:
3636

3737
chunk_id = edge["chunk_id"]
3838
page_number = chunk_to_page[int(chunk_id)]
39+
if isinstance(page_number, int):
40+
page_number += 1
3941

4042
# Add nodes with page attribute
4143
if edge["node_1"] not in graph:
42-
graph.add_node(edge["node_1"], pages=set([]), topic=edge["topic_node_1"])
44+
graph.add_node(
45+
edge["node_1"], pages=set([]), topic=edge["topic_node_1"]
46+
)
4347
if edge["node_2"] not in graph:
44-
graph.add_node(edge["node_2"], pages=set([]), topic=edge["topic_node_2"])
48+
graph.add_node(
49+
edge["node_2"], pages=set([]), topic=edge["topic_node_2"]
50+
)
4551

4652
# Add edge with attributes to the graph
4753
graph.add_edge(edge["node_1"], edge["node_2"], relation=edge["edge"])
@@ -128,7 +134,7 @@ def _get_graph_file_path_local_storage(graph_job_id: uuid.UUID) -> str:
128134

129135
@staticmethod
130136
def _graph_bfs_edges(
131-
graph: nx.Graph, graph_job: GraphJob, node: str, adj_depth: int
137+
graph: nx.Graph, graph_job: GraphJob, node: str, adj_depth: int
132138
) -> GraphVisData:
133139
nodes_data = []
134140
edges_data = []
@@ -170,14 +176,16 @@ def _graph_bfs_edges(
170176
)
171177
)
172178

173-
return GraphVisData(document_name=graph_job.name,
174-
graph_created_at=graph_job.updated_at,
175-
nodes=nodes_data,
176-
edges=edges_data)
179+
return GraphVisData(
180+
document_name=graph_job.name,
181+
graph_created_at=graph_job.updated_at,
182+
nodes=nodes_data,
183+
edges=edges_data,
184+
)
177185

178186
@staticmethod
179187
def _all_graph_data_for_visualization(
180-
graph: nx.Graph, graph_job: GraphJob
188+
graph: nx.Graph, graph_job: GraphJob
181189
) -> GraphVisData:
182190
nodes_data = []
183191
edges_data = []
@@ -208,7 +216,9 @@ def _all_graph_data_for_visualization(
208216
)
209217
)
210218

211-
return GraphVisData(document_name=graph_job.name,
212-
graph_created_at=graph_job.updated_at,
213-
nodes=nodes_data,
214-
edges=edges_data)
219+
return GraphVisData(
220+
document_name=graph_job.name,
221+
graph_created_at=graph_job.updated_at,
222+
nodes=nodes_data,
223+
edges=edges_data,
224+
)

Project/backend/codebase/tests/test_graph_handler.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from requests import patch
2-
from graph_creator.services.llm.llama_gemini_combination import llama_gemini_combination
32
from graph_creator import graph_handler
4-
from unittest.mock import patch, MagicMock
3+
from unittest.mock import patch
54

65
import json
76
import pandas as pd
@@ -74,6 +73,7 @@ def test_relation_extraction_from_llm_entity_not_in_lists():
7473
# Assert
7574
assert relation is None
7675

76+
7777
def test_component_connection_with_llm(mocker):
7878
"""
7979
Tests if component combination with llm works
@@ -89,7 +89,7 @@ def test_component_connection_with_llm(mocker):
8989
]
9090
"""
9191

92-
patcher = patch('graph_creator.services.llm.llama_gemini_combination')
92+
patcher = patch("graph_creator.services.llm.llama_gemini_combination")
9393
MockLlama3 = patcher.start()
9494
mock_instance = MockLlama3.return_value
9595

Project/frontend/src/components/App/index.css

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,6 @@ img {
5959
gap: 10px;
6060
}
6161

62-
63-
6462
.main_wrapper {
6563
display: flex;
6664
flex-direction: column;
@@ -71,13 +69,11 @@ img {
7169
min-height: 100%;
7270
}
7371

74-
75-
7672
.Appcontainer {
7773
display: flex;
7874
flex-direction: column;
7975
align-items: center;
8076
gap: 20px;
8177
min-width: 100%;
8278
min-height: 100%;
83-
}
79+
}

Project/frontend/src/components/App/index.tsx

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ import {
1717
} from '@mui/material';
1818

1919
import logo from '../../assets/team-logo.png';
20-
import Graph from '../Graph/index_visjs';
20+
import Graph from '../Graph_page/GraphVisualization';
2121
import UploadPage from '../UploadPage';
2222
import LandingPage from '../LandingPage';
23-
23+
import Navbar from '../Navbar/Navbar';
2424
import './index.css';
2525

2626
const theme = createTheme({
@@ -46,9 +46,7 @@ function App() {
4646
px={2}
4747
py={1}
4848
>
49-
<NavLink to="/">
50-
<img src={logo} alt="Logo" className="logo" />
51-
</NavLink>
49+
<Navbar />
5250
<Typography variant="h6">Graph Masters</Typography>
5351
<NavLink to="/" style={{ textDecoration: 'none' }}>
5452
<Typography variant="h6" sx={{ color: 'white' }}>

0 commit comments

Comments
 (0)