Skip to content

Commit 3d25efd

Browse files
committed
first test passed. Getting closer to release (still a way, but closer)
1 parent 6da6cb8 commit 3d25efd

File tree

204 files changed

+2011
-89
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

204 files changed

+2011
-89
lines changed

examples/scripts/readme_snippets.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,86 @@ def output_dim(self):
437437
n_epochs=1,
438438
batch_size=32,
439439
)
440+
441+
442+
# 8. A two-tower model

np.random.seed(42)

# ---- user side: id plus one numeric and two categorical features -------
user_ids = np.arange(1, 101)
user_features = pd.DataFrame(
    {
        "id": user_ids,
        "age": np.random.randint(18, 60, size=100),
        "gender": np.random.choice(["male", "female"], size=100),
        "location": np.random.choice(
            ["city_a", "city_b", "city_c", "city_d"], size=100
        ),
    }
)

# ---- item side: id plus one numeric and two categorical features -------
item_ids = np.arange(1, 101)
item_features = pd.DataFrame(
    {
        "id": item_ids,
        "price": np.random.uniform(10, 500, size=100).round(2),
        "color": np.random.choice(["red", "blue", "green", "black"], size=100),
        "category": np.random.choice(
            ["electronics", "clothing", "home", "toys"], size=100
        ),
    }
)

# ---- interactions: 1000 random user/item pairs with a binary label -----
interactions = pd.DataFrame(
    {
        "user_id": np.random.choice(user_ids, size=1000),
        "item_id": np.random.choice(item_ids, size=1000),
        "purchased": np.random.choice([0, 1], size=1000, p=[0.7, 0.3]),
    }
)

# Join both feature tables onto the interaction log
user_item_purchased = interactions.merge(
    user_features, left_on="user_id", right_on="id"
).merge(item_features, left_on="item_id", right_on="id")

# User tower
tab_preprocessor_user = TabPreprocessor(
    cat_embed_cols=["gender", "location"],
    continuous_cols=["age"],
)
X_user = tab_preprocessor_user.fit_transform(user_item_purchased)
tab_mlp_user = TabMlp(
    column_idx=tab_preprocessor_user.column_idx,
    cat_embed_input=tab_preprocessor_user.cat_embed_input,
    continuous_cols=["age"],
    mlp_hidden_dims=[16, 8],
    mlp_dropout=[0.2, 0.2],
)

# Item tower
tab_preprocessor_item = TabPreprocessor(
    cat_embed_cols=["color", "category"],
    continuous_cols=["price"],
)
X_item = tab_preprocessor_item.fit_transform(user_item_purchased)
tab_mlp_item = TabMlp(
    column_idx=tab_preprocessor_item.column_idx,
    cat_embed_input=tab_preprocessor_item.cat_embed_input,
    continuous_cols=["price"],
    mlp_hidden_dims=[16, 8],
    mlp_dropout=[0.2, 0.2],
)

# Fuse the two towers via a dot product of their outputs
two_tower_model = ModelFuser([tab_mlp_user, tab_mlp_item], fusion_method="dot")

model = WideDeep(deeptabular=two_tower_model)

trainer = Trainer(model, objective="binary")

trainer.fit(
    X_tab=[X_user, X_item],
    target=interactions.purchased.values,
    n_epochs=1,
    batch_size=32,
)

tests/test_multi_model_and_mutil_data/data_for_muti_tabular_components/test.csv

Lines changed: 101 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_multi_model_and_mutil_data/data_for_muti_tabular_components/train.csv

Lines changed: 801 additions & 0 deletions
Large diffs are not rendered by default.

tests/test_multi_model_and_mutil_data/data_for_muti_tabular_components/val.csv

Lines changed: 101 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# sometimes I call this script generate_fake_data.py
2+
import os
3+
import random
4+
from typing import Tuple
5+
from pathlib import Path
6+
7+
import cv2
8+
import numpy as np
9+
import pandas as pd
10+
from faker import Faker
11+
12+
13+
def generate_fake_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Generate a small synthetic multimodal dataset for the tests.

    Builds 96 rows (64 train / 16 val / 16 test) with one categorical,
    one numerical, two text and two image-filename columns plus a binary
    target, writes the png images and the three CSV splits under
    ``load_from_folder_test_data`` next to this file, and returns the
    splits as ``(train_df, val_df, test_df)``.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))

    # Seed every RNG used below so repeated runs regenerate identical data.
    # FIX: np.random was previously unseeded, so `num_col` and the image
    # pixels changed on every run despite the reproducibility intent.
    random.seed(42)
    np.random.seed(42)
    Faker.seed(42)

    num_rows = 64 + 16 + 16  # train + val + test split sizes

    # Generate random categorical data
    categories = ["category_A", "category_B", "category_C"]
    cat_col = [random.choice(categories) for _ in range(num_rows)]

    # Generate random numerical data
    num_col = [np.random.rand() for _ in range(num_rows)]

    # Generate random sentences
    fake = Faker()
    text_col1 = [fake.sentence() for _ in range(num_rows)]
    text_col2 = [fake.sentence() for _ in range(num_rows)]

    # Generate the image data: two independent 16x16 RGB images per row
    img_folder = "images"
    img_path = "/".join([current_dir, "load_from_folder_test_data", img_folder])
    # exist_ok avoids the check-then-create race of the explicit exists() test
    os.makedirs(img_path, exist_ok=True)

    for i in range(num_rows):
        image = np.random.randint(0, 256, (16, 16, 3), dtype="uint8")
        image_name = "image_set1_{}.png".format(i)
        cv2.imwrite("/".join([img_path, image_name]), image)

        image = np.random.randint(0, 256, (16, 16, 3), dtype="uint8")
        image_name = "image_set2_{}.png".format(i)
        cv2.imwrite("/".join([img_path, image_name]), image)

    # Generate fake binary target values
    target = [random.choice([0, 1]) for _ in range(num_rows)]

    # Assemble the DataFrame; image columns hold file names, not pixel data
    data = {
        "cat_col": cat_col,
        "num_col": num_col,
        "text_col1": text_col1,
        "text_col2": text_col2,
        "image_col1": ["image_set1_{}.png".format(i) for i in range(num_rows)],
        "image_col2": ["image_set2_{}.png".format(i) for i in range(num_rows)],
        "target": target,
    }
    df = pd.DataFrame(data)

    save_dir = Path(current_dir) / "load_from_folder_test_data"
    save_dir.mkdir(parents=True, exist_ok=True)

    # Fixed 64/16/16 split; rows are generated i.i.d. so no shuffle is needed
    train_df = df.iloc[:64]
    val_df = df.iloc[64:80]
    test_df = df.iloc[80:]

    train_df.to_csv(save_dir / "train.csv", index=False)
    val_df.to_csv(save_dir / "val.csv", index=False)
    test_df.to_csv(save_dir / "test.csv", index=False)

    print("Dataset and images created and saved successfully.")

    return train_df, val_df, test_df
86+
87+
88+
def generate_fake_data_for_mutil_tabular_components() -> (
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
):
    """Generate synthetic user/item/interaction data for multi-tabular tests.

    Creates 32 users, 32 items and 1000 user-item interactions, merges the
    user and item feature tables onto the interaction log, writes
    800/100/100 train/val/test CSV splits under
    ``data_for_muti_tabular_components`` next to this file, and returns
    the splits as ``(train_df, val_df, test_df)``.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    save_dir = Path(current_dir) / "data_for_muti_tabular_components"
    save_dir.mkdir(parents=True, exist_ok=True)

    # Seed every RNG used below so repeated runs regenerate identical data.
    # FIX: Faker was previously unseeded here, so the `review` and
    # `description` text columns changed on every run.
    random.seed(42)
    np.random.seed(42)
    Faker.seed(42)

    fake = Faker()

    # Create User Features DataFrame: id, numeric, categorical, text columns
    user_ids = range(1, 33)
    ages = np.random.randint(18, 65, size=32)
    genders = np.random.choice(["male", "female"], size=32)
    locations = np.random.choice(["location_a", "location_b", "location_c"], size=32)
    reviews = [fake.sentence(nb_words=10) for _ in range(32)]

    user_features = pd.DataFrame(
        {
            "id": user_ids,
            "age": ages,
            "gender": genders,
            "location": locations,
            "review": reviews,
        }
    )

    # Create Item Features DataFrame: id, numeric, categorical, text columns
    item_ids = range(1, 33)
    prices = np.round(np.random.uniform(10, 1000, size=32), 2)
    colors = np.random.choice(["red", "blue", "green", "yellow"], size=32)
    categories = np.random.choice(["category_1", "category_2", "category_3"], size=32)
    descriptions = [fake.sentence(nb_words=10) for _ in range(32)]

    item_features = pd.DataFrame(
        {
            "id": item_ids,
            "price": prices,
            "color": colors,
            "category": categories,
            "description": descriptions,
        }
    )

    # Create Interaction DataFrame
    interaction_data = []
    for _ in range(1000):  # maybe 1000 interactions is too much for a test
        user_id = random.choice(user_ids)
        item_id = random.choice(item_ids)
        purchased = random.choice([0, 1])
        interaction_data.append([user_id, item_id, purchased])

    interactions = pd.DataFrame(
        interaction_data, columns=["user_id", "item_id", "purchased"]
    )

    # Join both feature tables onto the interaction log
    user_item_purchased_df = interactions.merge(
        user_features, left_on="user_id", right_on="id"
    ).merge(item_features, left_on="item_id", right_on="id")

    # Fixed 800/100/100 split of the merged interactions
    train_df = user_item_purchased_df.iloc[:800]
    val_df = user_item_purchased_df.iloc[800:900]
    test_df = user_item_purchased_df.iloc[900:]

    train_df.to_csv(save_dir / "train.csv", index=False)
    val_df.to_csv(save_dir / "val.csv", index=False)
    test_df.to_csv(save_dir / "test.csv", index=False)

    return train_df, val_df, test_df
162+
163+
164+
if __name__ == "__main__":
    # Only the multi-tabular fixtures are regenerated by default; run
    # generate_fake_data() manually when the image/text fixtures change.
    generate_fake_data_for_mutil_tabular_components()

0 commit comments

Comments
 (0)