 from torch import nn
 from torch import Tensor
 from collections import OrderedDict
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Optional, Sequence
 
 from minerva.losses.negative_cossine_similatiry import NegativeCosineSimilarity
-
-# --- Model Parts ---------------------------------------------------------
-
-# Borrowed from https://github.com/lightly-ai/lightly/blob/master/lightly/models/modules/heads.py#L15
-
-
-class ProjectionHead(nn.Module):
-    """Base class for all projection and prediction heads."""
-
-    def __init__(
-        self,
-        blocks: Sequence[
-            Union[
-                Tuple[int, int, Optional[nn.Module], Optional[nn.Module]],
-                Tuple[int, int, Optional[nn.Module], Optional[nn.Module], bool],
-            ],
-        ],
-    ) -> None:
-        super().__init__()
-
-        layers: List[nn.Module] = []
-        for block in blocks:
-            input_dim, output_dim, batch_norm, non_linearity, *bias = block
-            use_bias = bias[0] if bias else not bool(batch_norm)
-            layers.append(nn.Linear(input_dim, output_dim, bias=use_bias))
-            if batch_norm:
-                layers.append(batch_norm)
-            if non_linearity:
-                layers.append(non_linearity)
-        self.layers = nn.Sequential(*layers)
-
-    def preprocess_step(self, x: Tensor) -> Tensor:
-        return x
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = self.preprocess_step(x)
-        projection: Tensor = self.layers(x)
-        return projection
-
-
-class BYOLProjectionHead(ProjectionHead):
-    """Projection head used for BYOL.
-    "This MLP consists in a linear layer with output size 4096 followed by
-    batch normalization, rectified linear units (ReLU), and a final
-    linear layer with output dimension 256." [0]
-    [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733
-    """
-
-    def __init__(
-        self, input_dim: int = 2048, hidden_dim: int = 4096, output_dim: int = 256
-    ):
-        super(BYOLProjectionHead, self).__init__(
-            [
-                (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()),
-                (hidden_dim, output_dim, None, None),
-            ]
-        )
-
-        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
-
-    def preprocess_step(self, x: Tensor) -> Tensor:
-        return self.avgpool(x).flatten(start_dim=1)
-
-
-class BYOLPredictionHead(ProjectionHead):
-    """Prediction head used for BYOL.
-    "This MLP consists in a linear layer with output size 4096 followed by
-    batch normalization, rectified linear units (ReLU), and a final
-    linear layer with output dimension 256." [0]
-    [0]: BYOL, 2020, https://arxiv.org/abs/2006.07733
-    """
-
-    def __init__(
-        self, input_dim: int = 256, hidden_dim: int = 4096, output_dim: int = 256
-    ):
-        super(BYOLPredictionHead, self).__init__(
-            [
-                (input_dim, hidden_dim, nn.BatchNorm1d(hidden_dim), nn.ReLU()),
-                (hidden_dim, output_dim, None, None),
-            ]
-        )
-
-
-# --- Class implementation ----------------------------------------------------------
+from minerva.models.nets.mlp import MLP
+from torch.optim import Optimizer
+from minerva.models.nets.image.deeplabv3 import DeepLabV3Backbone
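For reference, each tuple accepted by the removed ProjectionHead is (input_dim, output_dim, batch_norm, non_linearity) with an optional fifth bias entry; when bias is omitted it defaults to not bool(batch_norm). A minimal sketch of the deleted API, built only from the removed code above:

# Old, removed API: a BYOL-style projection head described as block tuples.
# Bias is omitted here, so it resolves to False for the first block (a
# BatchNorm follows) and True for the second (no BatchNorm).
head = ProjectionHead(
    [
        (2048, 4096, nn.BatchNorm1d(4096), nn.ReLU()),  # Linear -> BN -> ReLU
        (4096, 256, None, None),                        # plain Linear
    ]
)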
 
 
 class BYOL(L.LightningModule):
-    """A Bootstrap Your Own Latent (BYOL) model for self-supervised learning.
+    """Bootstrap Your Own Latent (BYOL) model for self-supervised learning.
 
     References
     ----------
     Grill, J., Strub, F., Altché, F., Tallec, C., Richemond, P. H., Buchatskaya, E., ... & Valko, M. (2020).
-    "Bootstrap your own latent-a new approach to self-supervised learning." Advances in neural information processing systems, 33, 21271-21284.
+    "Bootstrap your own latent - a new approach to self-supervised learning." Advances in Neural Information Processing Systems, 33, 21271-21284.
     """
 
     def __init__(
         self,
         backbone: Optional[nn.Module] = None,
-        learning_rate: float = 0.025,
-        schedule: int = 90000,
+        projection_head: Optional[nn.Module] = None,
+        prediction_head: Optional[nn.Module] = None,
+        learning_rate: float = 1e-3,
+        schedule: int = 90000,
+        criterion: Optional[nn.Module] = None,
     ):
         """
         Initializes the BYOL model.
 
         Parameters
         ----------
-        backbone: Optional[nn.Module]
-            The backbone network for feature extraction. Defaults to ResNet18.
-        learning_rate: float
-            The learning rate for the optimizer. Defaults to 0.025.
-        schedule: int
+        backbone : Optional[nn.Module]
+            The backbone network for feature extraction. Defaults to DeepLabV3Backbone.
+        projection_head : Optional[nn.Module]
+            Optional custom projection head module. If None, a default MLP-based projection head is used.
+        prediction_head : Optional[nn.Module]
+            Optional custom prediction head module. If None, a default MLP-based prediction head is used.
+        learning_rate : float
+            The learning rate for the optimizer. Defaults to 1e-3.
+        schedule : int
             The total number of steps for cosine decay scheduling. Defaults to 90000.
+        criterion : Optional[nn.Module]
+            Loss function to use. Defaults to NegativeCosineSimilarity.
         """
         super().__init__()
-        self.backbone = backbone or nn.Sequential(
-            *list(torchvision.models.resnet18().children())[:-1]
-        )
+        self.backbone = backbone or DeepLabV3Backbone()
         self.learning_rate = learning_rate
-        self.projection_head = BYOLProjectionHead(2048, 4096, 256)
-        self.prediction_head = BYOLPredictionHead(256, 4096, 256)
+        self.projection_head = projection_head or self._default_projection_head()
+        self.prediction_head = prediction_head or self._default_prediction_head()
         self.backbone_momentum = copy.deepcopy(self.backbone)
         self.projection_head_momentum = copy.deepcopy(self.projection_head)
         self.deactivate_requires_grad(self.backbone_momentum)
         self.deactivate_requires_grad(self.projection_head_momentum)
-        self.criterion = NegativeCosineSimilarity()
+        self.criterion = criterion or NegativeCosineSimilarity()
         self.schedule_length = schedule
 
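The training step itself is outside this hunk. As a rough sketch of how the momentum copies and schedule_length are typically driven in BYOL, the helpers below are illustrative; cosine_schedule, update_momentum, and forward_momentum are assumed names, not code from this file:

import math

import torch
from torch import nn

def cosine_schedule(step: int, total_steps: int, start: float = 0.996, end: float = 1.0) -> float:
    # BYOL's EMA coefficient: tau ramps from `start` to `end` over training.
    progress = min(step / max(total_steps, 1), 1.0)
    return end - (end - start) * (math.cos(math.pi * progress) + 1) / 2

@torch.no_grad()
def update_momentum(online: nn.Module, target: nn.Module, tau: float) -> None:
    # EMA update: target <- tau * target + (1 - tau) * online.
    for p_o, p_t in zip(online.parameters(), target.parameters()):
        p_t.mul_(tau).add_(p_o, alpha=1.0 - tau)

# Hypothetical training_step(self, batch, batch_idx):
#     tau = cosine_schedule(self.global_step, self.schedule_length)
#     update_momentum(self.backbone, self.backbone_momentum, tau)
#     update_momentum(self.projection_head, self.projection_head_momentum, tau)
#     x0, x1 = batch                                   # two augmented views
#     p0, p1 = self(x0), self(x1)                      # online branch predictions
#     z0, z1 = self.forward_momentum(x0), self.forward_momentum(x1)  # targets
#     return 0.5 * (self.criterion(p0, z1) + self.criterion(p1, z0))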
+    def _default_projection_head(self) -> nn.Module:
+        """Creates the default projection head used in BYOL."""
+        return nn.Sequential(
+            nn.AdaptiveAvgPool2d((1, 1)),
+            nn.Flatten(start_dim=1),
+            MLP(
+                layer_sizes=[2048, 4096, 256],
+                activation_cls=nn.ReLU,
+                intermediate_ops=[nn.BatchNorm1d(4096), None],
+            ),
+        )
+
+    def _default_prediction_head(self) -> nn.Module:
+        """Creates the default prediction head used in BYOL."""
+        return MLP(
+            layer_sizes=[256, 4096, 256],
+            activation_cls=nn.ReLU,
+            intermediate_ops=[nn.BatchNorm1d(4096), None],
+        )
+
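Assuming minerva's MLP places each entry of intermediate_ops after the corresponding linear layer, with the activation between layers, the new default projection head should match the deleted BYOLProjectionHead. A plain-PyTorch sketch of that stack; whether MLP disables the bias before BatchNorm1d, as the removed code did, is an assumption:

default_projection_head = nn.Sequential(
    nn.AdaptiveAvgPool2d((1, 1)),       # pool the (N, 2048, H, W) feature map to 1x1
    nn.Flatten(start_dim=1),            # (N, 2048, 1, 1) -> (N, 2048)
    nn.Linear(2048, 4096, bias=False),  # bias is redundant before BatchNorm
    nn.BatchNorm1d(4096),
    nn.ReLU(),
    nn.Linear(4096, 256),
)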
     def forward(self, x: Tensor) -> Tensor:
         """
         Forward pass for the BYOL model.
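A minimal usage sketch of the refactored constructor under the defaults above; the input shape is illustrative, and that forward returns the online branch's projection is an assumption based on the docstring:

import torch

model = BYOL()  # DeepLabV3Backbone + default MLP heads + NegativeCosineSimilarity

# Or with custom pieces swapped in:
model = BYOL(
    backbone=DeepLabV3Backbone(),
    learning_rate=1e-3,
    schedule=90000,
)

x = torch.randn(4, 3, 224, 224)  # batch of images (shape illustrative)
projection = model(x)            # assumed: projection of the online branch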