Skip to content

Commit 90c8193

Browse files
authored
Merge pull request #194 from jrzaurin/large-datasets
Large datasets
2 parents 74f1ab6 + 59ab075 commit 90c8193

File tree

124 files changed

+8754
-1031
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

124 files changed

+8754
-1031
lines changed

.coveragerc

+4-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ omit =
55
pytorch_widedeep/bayesian_models/bayesian_nn/modules/*
66

77
[report]
8+
exclude_also =
9+
def __repr__
810
omit =
911
pytorch_widedeep/optim/*
1012
pytorch_widedeep/bayesian_models/bayesian_nn/modules/*
11-
precision = 2
13+
precision = 2
14+

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.3.2
1+
1.4.0

examples/notebooks/20_load_from_folder_functionality.ipynb

+975
Large diffs are not rendered by default.

examples/scripts/adult_census.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -164,4 +164,4 @@
164164
)
165165

166166
# 3. Either fit or directly predict
167-
preds = trainer_new.predict(X_wide=X_wide, X_tab=X_tab)
167+
preds = trainer_new.predict(X_wide=X_wide, X_tab=X_tab, batch_size=256)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
import numpy as np
2+
import torch
3+
import pandas as pd
4+
from torch.utils.data import DataLoader
5+
6+
from pytorch_widedeep.models import TabResnet # noqa: F401
7+
from pytorch_widedeep.models import TabMlp, Vision, BasicRNN, WideDeep
8+
from pytorch_widedeep.training import TrainerFromFolder
9+
from pytorch_widedeep.callbacks import EarlyStopping, ModelCheckpoint
10+
from pytorch_widedeep.preprocessing import (
11+
ImagePreprocessor,
12+
ChunkTabPreprocessor,
13+
ChunkTextPreprocessor,
14+
)
15+
from pytorch_widedeep.load_from_folder import (
16+
TabFromFolder,
17+
TextFromFolder,
18+
ImageFromFolder,
19+
WideDeepDatasetFromFolder,
20+
)
21+
22+
use_cuda = torch.cuda.is_available()
23+
24+
if __name__ == "__main__":
25+
# The airbnb dataset, which you could get from here:
26+
# http://insideairbnb.com/get-the-data.html, is too big to be included in
27+
# our datasets module (when including images). Therefore, go there,
28+
# download it, and use the download_images.py script to get the images
29+
# and the airbnb_data_processing.py to process the data. We'll find
30+
# better datasets in the future ;). Note that here we are only using a
31+
# small sample to illustrate the use, so PLEASE ignore the results, just
32+
# focus on usage
33+
34+
# For this exercise, we use a small sample of the airbnb dataset,
35+
# comprised of tabular data with a text column ('description') and an
36+
# image column ('id') that point to the images of the properties listed
37+
# in Airbnb. We know the size of the sample before hand (1001) so we set
38+
# a series of parameters accordingly
39+
train_size = 800
40+
eval_size = 100
41+
test_size = 101
42+
chunksize = 100
43+
n_chunks = int(np.ceil(train_size / chunksize))
44+
45+
data_path = "../tmp_data/airbnb/"
46+
train_fname = "airbnb_sample_train.csv"
47+
eval_fname = "airbnb_sample_eval.csv"
48+
test_fname = "airbnb_sample_test.csv"
49+
50+
# the images are stored in the 'property_picture' while the text is a
51+
# column in the 'airbnb_sample' dataframe. Let's then define the dir and
52+
# file variables
53+
img_path = "../tmp_data/airbnb/property_picture/"
54+
img_col = "id"
55+
text_col = "description"
56+
target_col = "yield"
57+
cat_embed_cols = [
58+
"host_listings_count",
59+
"neighbourhood_cleansed",
60+
"is_location_exact",
61+
"property_type",
62+
"room_type",
63+
"accommodates",
64+
"bathrooms",
65+
"bedrooms",
66+
"beds",
67+
"guests_included",
68+
"minimum_nights",
69+
"instant_bookable",
70+
"cancellation_policy",
71+
"has_house_rules",
72+
"host_gender",
73+
"accommodates_catg",
74+
"guests_included_catg",
75+
"minimum_nights_catg",
76+
"host_listings_count_catg",
77+
"bathrooms_catg",
78+
"bedrooms_catg",
79+
"beds_catg",
80+
"security_deposit",
81+
"extra_people",
82+
]
83+
cont_cols = ["latitude", "longitude"]
84+
85+
# Now, processing the data from here on can be done in two ways:
86+
# 1. The tabular data itsel fits in memory as is only the images that do
87+
# not: in this case you could use the 'standard' Preprocessors and off
88+
# you go, move directly to the '[...]FromFolder' functionalities
89+
90+
# 2. The tabular data is also very large and does not fit in memory, so we
91+
# have to process it in chuncks. For this second case I have created the
92+
# Chunk Processors (Wide, Tab and Text). Note that at the moment ONLY csv
93+
# format is allowed for the tabular file. More formats will be supported
94+
# in the future.
95+
96+
# For the following I will assume (simply for illustration purposes) that
97+
# we are in the second case. Nonetheless, the process, whether 1 or 2,
98+
# can be summarised as follows:
99+
# 1. Process the data
100+
# 2. Define the loaders from folder
101+
# 3. Define the datasets and dataloaders
102+
# 4. Define the model and the Trainer
103+
# 5. Fit the model and Predict
104+
105+
# Process the data in Chunks
106+
tab_preprocessor = ChunkTabPreprocessor(
107+
embed_cols=cat_embed_cols,
108+
continuous_cols=cont_cols,
109+
n_chunks=n_chunks,
110+
default_embed_dim=8,
111+
verbose=0,
112+
)
113+
114+
text_preprocessor = ChunkTextPreprocessor(
115+
n_chunks=n_chunks,
116+
text_col=text_col,
117+
n_cpus=1,
118+
)
119+
120+
# Note that all the (pre)processing of the images will occur 'on the fly',
121+
# as they are loaded from disk. Therefore, the flow for the image dataset
122+
# and for the tabular and text data modes is not entirely the same.
123+
# Tabular and text data uses Chunk processors while such processing
124+
# approach is not needed for the images
125+
img_preprocessor = ImagePreprocessor(
126+
img_col=img_col,
127+
img_path=img_path,
128+
)
129+
130+
for i, chunk in enumerate(
131+
pd.read_csv("/".join([data_path, train_fname]), chunksize=chunksize)
132+
):
133+
print(f"chunk in loop: {i}")
134+
tab_preprocessor.fit(chunk)
135+
text_preprocessor.fit(chunk)
136+
137+
# Instantiate the loaders from folder: again here some explanation is
138+
# required. As I mentioned earlier the "[...]FromFolder" functionalities
139+
# are thought for the case when we have tabular and text and/or image
140+
# datasets and the latter do not fit in memory, so they have to be loaded
141+
# from disk. With this in mind, the tabular data is the reference, and
142+
# must have columns that point to the image files and to the text files
143+
# (in case these exists instead of a column with the texts). Since the
144+
# tabular data is used as a reference, is the one that has to be splitted
145+
# in train/validation/test. The test and image 'FromFolder' objects only
146+
# point to the corresponding column or files, and therefore, we do not
147+
# need to create a separate instance per train/validation/test dataset
148+
train_tab_folder = TabFromFolder(
149+
fname=train_fname,
150+
directory=data_path,
151+
target_col=target_col,
152+
preprocessor=tab_preprocessor,
153+
text_col=text_col,
154+
img_col=img_col,
155+
)
156+
eval_tab_folder = TabFromFolder(fname=eval_fname, reference=train_tab_folder) # type: ignore[arg-type]
157+
test_tab_folder = TabFromFolder(
158+
fname=test_fname, reference=train_tab_folder, ignore_target=True # type: ignore[arg-type]
159+
)
160+
161+
text_folder = TextFromFolder(
162+
preprocessor=text_preprocessor,
163+
)
164+
165+
img_folder = ImageFromFolder(preprocessor=img_preprocessor)
166+
167+
# Following 'standard' pytorch approaches, we define the datasets and then
168+
# the dataloaders
169+
train_dataset_folder = WideDeepDatasetFromFolder(
170+
n_samples=train_size,
171+
tab_from_folder=train_tab_folder,
172+
text_from_folder=text_folder,
173+
img_from_folder=img_folder,
174+
)
175+
eval_dataset_folder = WideDeepDatasetFromFolder(
176+
n_samples=eval_size,
177+
tab_from_folder=eval_tab_folder,
178+
reference=train_dataset_folder,
179+
)
180+
test_dataset_folder = WideDeepDatasetFromFolder(
181+
n_samples=test_size,
182+
tab_from_folder=test_tab_folder,
183+
reference=train_dataset_folder,
184+
)
185+
train_loader = DataLoader(train_dataset_folder, batch_size=16, num_workers=1)
186+
eval_loader = DataLoader(eval_dataset_folder, batch_size=16, num_workers=1)
187+
test_loader = DataLoader(test_dataset_folder, batch_size=16, num_workers=1)
188+
189+
# And from here on, is all pretty standard within the library
190+
basic_rnn = BasicRNN(
191+
vocab_size=len(text_preprocessor.vocab.itos),
192+
embed_dim=32,
193+
hidden_dim=64,
194+
n_layers=2,
195+
head_hidden_dims=[100, 50],
196+
)
197+
198+
deepimage = Vision(
199+
pretrained_model_name="resnet18", n_trainable=0, head_hidden_dims=[200, 100]
200+
)
201+
202+
deepdense = TabMlp(
203+
mlp_hidden_dims=[64, 32],
204+
column_idx=tab_preprocessor.column_idx,
205+
cat_embed_input=tab_preprocessor.cat_embed_input,
206+
continuous_cols=cont_cols,
207+
)
208+
209+
model = WideDeep(
210+
deeptabular=deepdense,
211+
deeptext=basic_rnn,
212+
deepimage=deepimage,
213+
)
214+
215+
callbacks = [EarlyStopping, ModelCheckpoint(filepath="model_weights/wd_out.pt")]
216+
217+
trainer = TrainerFromFolder(
218+
model,
219+
objective="regression",
220+
callbacks=callbacks,
221+
)
222+
223+
trainer.fit(
224+
train_loader=train_loader,
225+
eval_loader=eval_loader,
226+
finetune=True,
227+
finetune_epochs=1,
228+
)
229+
preds = trainer.predict(test_loader=test_loader)

mkdocs/mkdocs.yml

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ nav:
2626
- Image utils: pytorch-widedeep/utils/image_utils.md
2727
- Text utils: pytorch-widedeep/utils/text_utils.md
2828
- Preprocessing: pytorch-widedeep/preprocessing.md
29+
- Load From Folder: pytorch-widedeep/load_from_folder.md
2930
- Model Components: pytorch-widedeep/model_components.md
3031
- Bayesian models: pytorch-widedeep/bayesian_models.md
3132
- Losses: pytorch-widedeep/losses.md
@@ -59,6 +60,7 @@ nav:
5960
- 18_feature_importance_via_attention_weights: examples/18_feature_importance_via_attention_weights.ipynb
6061
- 19_wide_and_deep_for_recsys_pt1: examples/19_wide_and_deep_for_recsys_pt1.ipynb
6162
- 19_wide_and_deep_for_recsys_pt2: examples/19_wide_and_deep_for_recsys_pt2.ipynb
63+
- 20_load_from_folder_functionality.: examples/20_load_from_folder_functionality.ipynb
6264
- Contributing: contributing.md
6365

6466
theme:

mkdocs/site/404.html

+32
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,8 @@
406406

407407

408408

409+
410+
409411

410412

411413

@@ -553,6 +555,20 @@
553555

554556

555557

558+
<li class="md-nav__item">
559+
<a href="/pytorch-widedeep/load_from_folder.html" class="md-nav__link">
560+
Load From Folder
561+
</a>
562+
</li>
563+
564+
565+
566+
567+
568+
569+
570+
571+
556572
<li class="md-nav__item">
557573
<a href="/pytorch-widedeep/model_components.html" class="md-nav__link">
558574
Model Components
@@ -749,6 +765,8 @@
749765

750766

751767

768+
769+
752770

753771

754772

@@ -1060,6 +1078,20 @@
10601078

10611079

10621080

1081+
1082+
1083+
1084+
1085+
1086+
<li class="md-nav__item">
1087+
<a href="/examples/20_load_from_folder_functionality.html" class="md-nav__link">
1088+
20_load_from_folder_functionality.
1089+
</a>
1090+
</li>
1091+
1092+
1093+
1094+
10631095
</ul>
10641096
</nav>
10651097
</li>

0 commit comments

Comments
 (0)