
Commit 79272f8

Merge branch 'fani-lab:main' into main
2 parents eae5c1c + 0d8732b commit 79272f8

11 files changed (+914, -146 lines)

docs/_todo.txt

Lines changed: 16 additions & 27 deletions
@@ -1,8 +1,19 @@
-multilayer neural classifiers
+graph neural network - e2e with transfer support (transductive splits per fold)
+node2vec,
+- test
+- eval
+
+metapath2vec
+- train
+- test
+- eval
+
+homo versions: gcn, gs, gat, gatv2, gin
+- train
+- test
+- eval
 
-nmt (from nmt branch and sigir25 paper)
->> seperate _prep() from learn()
->> learn() does not save checkpoints
+multilayer neural classifiers
 
 - rrn >> isn't it temporal?
 - bundle recommenders
@@ -18,20 +29,12 @@ multilayer neural classifiers
 {emb}_{fnn,bnn}_cl_{train/test} >> future
 
 -- end2end
-make a connection between train/vali/test splits of teams and train/valid/test edge lists >> this should be done per-fold!!
-
-Let's start the transfer part for now. I'll complete the tasks when I come to e2e
-
-- strict splits, transductive, inductive
+- inductive
 - hetero version of gnns
-
-Notes:
-
 Our graph are mainly hetero, but gnn methods are mainly designed for homo graph. So, we create hetero, make it to_homo(), ...
 For strict hetero, we will use HeteroConv that wraps gnn methods per node type
 Indeed, it is worth study a mixture of gnn methods for node types :D
 
-
 update to readme for cmn layer and main readme
 
 d2v
@@ -49,20 +52,6 @@ def evaluate_model(model, val_corpus):
 sims.append(top_score)
 return sum(sims) / len(sims) >> a doc, when in test, should return the same doc if in train, or the most similar one (textoverlap?)
 
-
--- gnn-transferred >> for now, entire graph is seen. But
-Strict split way: create a training graph using training teams. The test teams should then added to infer the node embeddings.
-luckily the nodes are already seen (skills, experts) (team nodes are always unseen), but if not seen, either remove the node or infer based on surroundings.
-
-
-node2vec, metapath2vec
--- transductive vs. inductive >> train, valid, test
--- yet to be run on entire datasets >> also check to engage cuda
-
-homo versions: gcn, gs, gat, gatv2, gin
--- transductive vs. inductive >> train, valid, test
--- yet to be run on entire datasets >> also check to engage cuda
-
 =========
 hetero versions of gcn, gs, gat, gatv2, gin
 han >> pure hetero like m2v
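
Editor's sketch for the hetero-vs-homo note in the diff above: the graph can either be collapsed with to_homogeneous() and fed to any homogeneous GNN, or kept strictly heterogeneous with HeteroConv, which runs one convolution per edge type and aggregates per destination node type. The node/edge type names, layer sizes, and the SAGE/GAT mix below are illustrative assumptions, not the repository's actual schema or models.

import torch.nn as nn
from torch_geometric.nn import HeteroConv, SAGEConv, GATConv

class StrictHeteroGNN(nn.Module):
    def __init__(self, hidden=64):
        super().__init__()
        # HeteroConv applies one convolution per edge type and sums the results per
        # destination node type; mixing operators per relation is the "mixture" idea above.
        self.conv = HeteroConv({
            ('expert', 'has', 'skill'): SAGEConv((-1, -1), hidden),       # placeholder relation
            ('skill', 'rev_has', 'expert'): SAGEConv((-1, -1), hidden),   # placeholder relation
            ('expert', 'in', 'team'): GATConv((-1, -1), hidden, add_self_loops=False),
        }, aggr='sum')

    def forward(self, x_dict, edge_index_dict):
        return {ntype: h.relu() for ntype, h in self.conv(x_dict, edge_index_dict).items()}

# The lighter route collapses types first and reuses any homogeneous GNN:
#   homo = hetero_data.to_homogeneous(); out = homo_gnn(homo.x, homo.edge_index)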

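For the d2v item, only the tail of evaluate_model() is visible in the hunk above; a hedged reconstruction of the whole function, assuming gensim's Doc2Vec API and a val_corpus of TaggedDocument objects (not the repository's exact code), could read:

from gensim.models import Doc2Vec

def evaluate_model(model: Doc2Vec, val_corpus):
    sims = []
    for doc in val_corpus:
        inferred = model.infer_vector(doc.words)                           # embed the held-out document
        top_tag, top_score = model.dv.most_similar([inferred], topn=1)[0]  # closest training document
        sims.append(top_score)
    # Mean top-1 similarity; per the note above, a training doc re-queried at test time should
    # retrieve itself, and an unseen doc its most similar (text-overlapping) training doc.
    return sum(sims) / len(sims)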
docs/tutorial/cikm25/index.html

Lines changed: 9 additions & 0 deletions
<!DOCTYPE html>
<html>
<head> </head>
<body>
<p>Hello World!</p>
<p>This page is under construction for our tutorial at CIKM25, 10-14 Nov. 2025 in Seoul, Korea ...</p>
<p><a href="https://fani-lab.github.io/OpeNTF/tutorial/wsdm25/">Our previous tutorial was at WSDM25, March 10, 2025, in Hannover, Germany</a></p>
</body>
</html>
Lines changed: 132 additions & 0 deletions
import os
import copy

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch_geometric.loader import NeighborLoader


class FoldDataset:
    def __init__(self, all_edges, fold_indices):
        """
        all_edges: torch.Tensor of shape [2, num_edges] (src, dst)
        fold_indices: list of (train_idx, val_idx) tuples for each fold
        """
        self.all_edges = all_edges
        self.fold_indices = fold_indices

    def get_fold(self, i):
        train_idx, val_idx = self.fold_indices[i]
        return self.all_edges[:, train_idx], self.all_edges[:, val_idx]


class LinkPredictionCVRunner:
    def __init__(self, data, test_edges, fold_dataset, model_fn, decoder_fn,
                 save_dir='cv_results', device='cpu', num_epochs=10, batch_size=1024,
                 edge_type_supervision=('a', 'to', 'b')):
        self.orig_data = data
        self.test_edges = test_edges
        self.fold_dataset = fold_dataset
        self.model_fn = model_fn
        self.decoder_fn = decoder_fn
        self.device = device
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.save_dir = save_dir
        self.edge_type_supervision = edge_type_supervision
        os.makedirs(save_dir, exist_ok=True)

    def _to_homo_cached(self, data):
        if hasattr(data, '_homo_cache'):
            return data._homo_cache
        data._homo_cache = data.to_homogeneous()
        return data._homo_cache

    def run_fold(self, fold_idx):
        train_edges, val_edges = self.fold_dataset.get_fold(fold_idx)

        # Create a fold-specific training graph: only this fold's training edges
        # are kept for the supervised edge type.
        data_fold = copy.deepcopy(self.orig_data)
        if self.edge_type_supervision is not None:
            data_fold[self.edge_type_supervision].edge_index = train_edges

        homo = self._to_homo_cached(data_fold)

        if self.edge_type_supervision is not None:
            etype_id = homo.edge_type_names.index(self.edge_type_supervision)
            edge_type_mask = homo.edge_type == etype_id
            pos_edge_index = homo.edge_index[:, edge_type_mask]
        else:
            pos_edge_index = train_edges  # assume full global edge supervision input

        edge_dataset = TensorDataset(pos_edge_index[0], pos_edge_index[1])
        edge_loader = DataLoader(edge_dataset, batch_size=self.batch_size, shuffle=True)

        model = self.model_fn().to(self.device)
        decoder = self.decoder_fn().to(self.device)
        optimizer = torch.optim.Adam(list(model.parameters()) + list(decoder.parameters()), lr=1e-3)

        for epoch in range(self.num_epochs):
            model.train()
            decoder.train()
            for src_pos, dst_pos in edge_loader:
                # Sample a neighborhood subgraph around the endpoints of this edge batch.
                node_ids = torch.cat([src_pos, dst_pos]).unique()
                sub_loader = NeighborLoader(
                    homo,
                    input_nodes=node_ids,
                    num_neighbors=[15, 10],
                    batch_size=node_ids.size(0),
                    shuffle=False
                )
                sub_data = next(iter(sub_loader)).to(self.device)

                x = model(sub_data.x, sub_data.edge_index)

                # NeighborLoader relabels nodes: the seed nodes (node_ids) occupy the first
                # rows of the subgraph, in order. Map global ids to local rows before
                # indexing the subgraph embeddings.
                local_pos = torch.full((homo.num_nodes,), -1, dtype=torch.long)
                local_pos[node_ids] = torch.arange(node_ids.size(0))
                src_local = local_pos[src_pos].to(self.device)
                dst_local = local_pos[dst_pos].to(self.device)
                pos_out = decoder(x[src_local], x[dst_local])

                # Random negative destinations, drawn from the sampled subgraph nodes.
                neg_local = torch.randint(0, x.size(0), (src_local.size(0),), device=self.device)
                neg_out = decoder(x[src_local], x[neg_local])

                pos_loss = F.binary_cross_entropy_with_logits(pos_out, torch.ones_like(pos_out))
                neg_loss = F.binary_cross_entropy_with_logits(neg_out, torch.zeros_like(neg_out))
                loss = pos_loss + neg_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Save model & results
        model_path = os.path.join(self.save_dir, f'model_fold{fold_idx}.pt')
        torch.save({'model': model.state_dict(), 'decoder': decoder.state_dict()}, model_path)

        val_result = self.evaluate(model, decoder, homo, val_edges, name=f'val_fold{fold_idx}')
        test_result = self.evaluate(model, decoder, homo, self.test_edges, name=f'test_fold{fold_idx}')

        result_path = os.path.join(self.save_dir, f'result_fold{fold_idx}.pt')
        torch.save({'val': val_result, 'test': test_result}, result_path)

    @torch.no_grad()
    def evaluate(self, model, decoder, homo, edge_index, name='val'):
        model.eval()
        decoder.eval()
        # Full-graph forward pass, so global node ids index the embeddings directly.
        x = model(homo.x.to(self.device), homo.edge_index.to(self.device))

        src, dst = edge_index[0].to(self.device), edge_index[1].to(self.device)
        pos_pred = decoder(x[src], x[dst]).sigmoid()

        neg_dst = torch.randint(0, homo.num_nodes, (len(src),), device=self.device)
        neg_pred = decoder(x[src], x[neg_dst]).sigmoid()

        y_true = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)])
        y_score = torch.cat([pos_pred, neg_pred])

        from sklearn.metrics import roc_auc_score, average_precision_score
        auc = roc_auc_score(y_true.cpu(), y_score.cpu())
        ap = average_precision_score(y_true.cpu(), y_score.cpu())
        return {'auc': auc, 'ap': ap}

    def run_all_folds(self):
        for i in range(len(self.fold_dataset.fold_indices)):
            self.run_fold(i)
        print("All folds completed.")
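
For reference, a minimal usage sketch of the runner above. The GNNEncoder/DotDecoder modules, the toy HeteroData with a single ('a', 'to', 'b') relation, and the KFold edge splits are all illustrative assumptions, not part of this commit.

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv

class GNNEncoder(nn.Module):                       # assumed two-layer GraphSAGE encoder
    def __init__(self, in_dim=16, hidden=32):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden)
        self.conv2 = SAGEConv(hidden, hidden)
    def forward(self, x, edge_index):
        return self.conv2(self.conv1(x, edge_index).relu(), edge_index)

class DotDecoder(nn.Module):                       # assumed dot-product link decoder
    def forward(self, z_src, z_dst):
        return (z_src * z_dst).sum(dim=-1)

data = HeteroData()                                # toy graph with random features and edges
data['a'].x = torch.randn(100, 16)
data['b'].x = torch.randn(50, 16)
all_edges = torch.stack([torch.randint(0, 100, (500,)), torch.randint(0, 50, (500,))])
data['a', 'to', 'b'].edge_index = all_edges

kf = KFold(n_splits=3, shuffle=True, random_state=0)
folds = [(torch.as_tensor(tr), torch.as_tensor(va)) for tr, va in kf.split(np.arange(all_edges.size(1)))]

runner = LinkPredictionCVRunner(data, test_edges=all_edges[:, :50],
                                fold_dataset=FoldDataset(all_edges, folds),
                                model_fn=GNNEncoder, decoder_fn=DotDecoder,
                                edge_type_supervision=('a', 'to', 'b'), num_epochs=2)
# runner.run_all_folds()  # trains, saves, and evaluates one model per fold under cv_results/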
Lines changed: 135 additions & 0 deletions
import os
import copy

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch_geometric.loader import NeighborLoader


class FoldDataset:
    def __init__(self, all_edges, fold_indices):
        """
        all_edges: torch.Tensor of shape [2, num_edges] (src, dst)
        fold_indices: list of (train_idx, val_idx) tuples for each fold
        """
        self.all_edges = all_edges
        self.fold_indices = fold_indices

    def get_fold(self, i):
        train_idx, val_idx = self.fold_indices[i]
        return self.all_edges[:, train_idx], self.all_edges[:, val_idx]


class LinkPredictionCVRunner:
    def __init__(self, data, test_edges, fold_dataset, model_fn, decoder_fn,
                 save_dir='cv_results', device='cpu', num_epochs=10, batch_size=1024,
                 edge_types_supervision=None):
        """
        edge_types_supervision: list of edge types to supervise on.
        If empty or None, supervise on all edge types.
        """
        self.orig_data = data
        self.test_edges = test_edges
        self.fold_dataset = fold_dataset
        self.model_fn = model_fn
        self.decoder_fn = decoder_fn
        self.device = device
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.save_dir = save_dir
        self.edge_types_supervision = edge_types_supervision or []
        os.makedirs(save_dir, exist_ok=True)

    def _to_homo_cached(self, data):
        if hasattr(data, '_homo_cache'):
            return data._homo_cache
        data._homo_cache = data.to_homogeneous()
        return data._homo_cache

    def run_fold(self, fold_idx):
        train_edges, val_edges = self.fold_dataset.get_fold(fold_idx)

        # Fold-specific training graph: restrict every supervised edge type to this fold's training edges.
        data_fold = copy.deepcopy(self.orig_data)
        if self.edge_types_supervision:
            for etype in self.edge_types_supervision:
                data_fold[etype].edge_index = train_edges

        homo = self._to_homo_cached(data_fold)

        if self.edge_types_supervision:
            etype_ids = [homo.edge_type_names.index(etype) for etype in self.edge_types_supervision]
            edge_type_mask = torch.isin(homo.edge_type, torch.tensor(etype_ids, device=homo.edge_type.device))
            pos_edge_index = homo.edge_index[:, edge_type_mask]
        else:
            pos_edge_index = train_edges

        edge_dataset = TensorDataset(pos_edge_index[0], pos_edge_index[1])
        edge_loader = DataLoader(edge_dataset, batch_size=self.batch_size, shuffle=True)

        model = self.model_fn().to(self.device)
        decoder = self.decoder_fn().to(self.device)
        optimizer = torch.optim.Adam(list(model.parameters()) + list(decoder.parameters()), lr=1e-3)

        for epoch in range(self.num_epochs):
            model.train()
            decoder.train()
            for src_pos, dst_pos in edge_loader:
                # Sample a neighborhood subgraph around the endpoints of this edge batch.
                node_ids = torch.cat([src_pos, dst_pos]).unique()
                sub_loader = NeighborLoader(
                    homo,
                    input_nodes=node_ids,
                    num_neighbors=[15, 10],
                    batch_size=node_ids.size(0),
                    shuffle=False
                )
                sub_data = next(iter(sub_loader)).to(self.device)

                x = model(sub_data.x, sub_data.edge_index)

                # NeighborLoader relabels nodes: the seed nodes (node_ids) occupy the first
                # rows of the subgraph, in order. Map global ids to local rows before
                # indexing the subgraph embeddings.
                local_pos = torch.full((homo.num_nodes,), -1, dtype=torch.long)
                local_pos[node_ids] = torch.arange(node_ids.size(0))
                src_local = local_pos[src_pos].to(self.device)
                dst_local = local_pos[dst_pos].to(self.device)
                pos_out = decoder(x[src_local], x[dst_local])

                # Random negative destinations, drawn from the sampled subgraph nodes.
                neg_local = torch.randint(0, x.size(0), (src_local.size(0),), device=self.device)
                neg_out = decoder(x[src_local], x[neg_local])

                pos_loss = F.binary_cross_entropy_with_logits(pos_out, torch.ones_like(pos_out))
                neg_loss = F.binary_cross_entropy_with_logits(neg_out, torch.zeros_like(neg_out))
                loss = pos_loss + neg_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # Save model & results
        model_path = os.path.join(self.save_dir, f'model_fold{fold_idx}.pt')
        torch.save({'model': model.state_dict(), 'decoder': decoder.state_dict()}, model_path)

        val_result = self.evaluate(model, decoder, homo, val_edges, name=f'val_fold{fold_idx}')
        test_result = self.evaluate(model, decoder, homo, self.test_edges, name=f'test_fold{fold_idx}')

        result_path = os.path.join(self.save_dir, f'result_fold{fold_idx}.pt')
        torch.save({'val': val_result, 'test': test_result}, result_path)

    @torch.no_grad()
    def evaluate(self, model, decoder, homo, edge_index, name='val'):
        model.eval()
        decoder.eval()
        # Full-graph forward pass, so global node ids index the embeddings directly.
        x = model(homo.x.to(self.device), homo.edge_index.to(self.device))

        src, dst = edge_index[0].to(self.device), edge_index[1].to(self.device)
        pos_pred = decoder(x[src], x[dst]).sigmoid()

        neg_dst = torch.randint(0, homo.num_nodes, (len(src),), device=self.device)
        neg_pred = decoder(x[src], x[neg_dst]).sigmoid()

        y_true = torch.cat([torch.ones_like(pos_pred), torch.zeros_like(neg_pred)])
        y_score = torch.cat([pos_pred, neg_pred])

        from sklearn.metrics import roc_auc_score, average_precision_score
        auc = roc_auc_score(y_true.cpu(), y_score.cpu())
        ap = average_precision_score(y_true.cpu(), y_score.cpu())
        return {'auc': auc, 'ap': ap}

    def run_all_folds(self):
        for i in range(len(self.fold_dataset.fold_indices)):
            self.run_fold(i)
        print("All folds completed.")
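
This second variant only changes how supervision is specified: edge_types_supervision takes a list of edge types, each of which is restricted to the fold's training edges and pooled into the positives. A hypothetical call, with placeholder edge-type names and the same assumed encoder/decoder as the earlier sketch, would be:

runner = LinkPredictionCVRunner(data, test_edges, FoldDataset(all_edges, folds),
                                model_fn=GNNEncoder, decoder_fn=DotDecoder,
                                edge_types_supervision=[('expert', 'in', 'team'),   # placeholder edge types
                                                        ('skill', 'of', 'team')])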
