|
69 | 69 | # - Training time (sec)
70 | 70 | #
71 | 71 | # * - Linear method (one-vs-rest)
72 |    | -# - 0.5171960144875225
73 |    | -# - 4.327306747436523
   | 72 | +# - 0.52
   | 73 | +# - 4.33
74 | 74 | #
75 | 75 | # * - Deep learning method (BERT)
76 |    | -# - 0.564618763137536
77 |    | -# - 5412.955321788788
   | 76 | +# - 0.56
   | 77 | +# - 5412.96
78 | 78 | #
79 | 79 | # Step 2. Training:
80 | 80 | # -----------------
|
|
120 | 120 | # - Macro-F1
121 | 121 | #
122 | 122 | # * - One-vs-rest
123 |     | -# - 0.5171960144875225
    | 123 | +# - 0.52
124 | 124 | #
125 | 125 | # * - Thresholding
126 |     | -# - 0.5643407144065415
    | 126 | +# - 0.56
127 | 127 | #
128 | 128 | # * - Cost-sensitive
129 |     | -# - 0.5704056980791481
    | 129 | +# - 0.57
130 | 130 | #
131 | 131 | # From the comparison, one can see that these techniques improve upon the naive method.
132 | 132 | #
|
|
139 | 139 | # Training models directly in this case may result in high runtime and space consumption.
140 | 140 | # A solution to reduce these costs is to utilize tree-based models.
141 | 141 | # Here we provide an example comparing a linear one-vs-rest model and a tree model on the EUR-Lex-57k dataset, which has a larger label space.
142 |     | -# We start by training a tree model following another detailed `tutorial <../auto_examples/plot_linear_tree_tutorial.html>`__.
    | 142 | +# We start by training a tree model following the `linear tree tutorial <../auto_examples/plot_linear_tree_tutorial.html>`__.
143 | 143 |
144 | 144 | datasets_eurlex = linear.load_dataset("txt", "data/eurlex57k/train.txt", "data/eurlex57k/test.txt")
145 | 145 | preprocessor_eurlex = linear.Preprocessor()
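The intuition behind the tree model's efficiency can be sketched with some back-of-the-envelope arithmetic. The branching factor, tree shape, and label count below are illustrative assumptions for this sketch, not LibMultiLabel's actual implementation or the real EUR-Lex-57k statistics:

```python
import math

def ovr_num_problems(num_labels):
    # flat one-vs-rest: one binary problem per label, each trained on all samples
    return num_labels

def tree_num_problems_per_sample(num_labels, branch=100):
    # balanced label tree with branching factor `branch` (assumed shape):
    # a sample passes through one node per level and only touches the
    # `branch` classifiers at that node, instead of all labels
    depth = math.ceil(math.log(num_labels, branch))
    return depth * branch

# with an illustrative 4000-label space, each sample participates in
# 4000 binary problems under flat one-vs-rest, but only depth * branch
# = 2 * 100 = 200 under the assumed tree
flat = ovr_num_problems(4000)
tree = tree_num_problems_per_sample(4000)
```

Under these assumptions the per-sample training work drops by roughly a factor of 20, which is the kind of gap the timing comparison below reflects.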
|
|
168 | 168 | #
169 | 169 | # It is clear that the tree model significantly improves efficiency.
170 | 170 | # As for deep learning, a similar improvement in efficiency can be observed.
171 |     | -# Details for the tree-based deep learning model can be found in this `tutorial <../tutorials/AttentionXML.html>`__.
    | 171 | +# Details for the tree-based deep learning model can be found in the `deep learning tree tutorial <../tutorials/AttentionXML.html>`__.
172 | 172 | #
173 | 173 | # Step 3. Evaluation: Pick Suitable Metrics
174 | 174 | # -----------------------------------------
|
|
203 | 203 | # -----------------------------
204 | 204 | # Models with suboptimal hyperparameters may lead to poor performance :cite:p:`JJL21a`.
205 | 205 | # Users can incorporate hyperparameter tuning into the training process.
206 |     | -# Because this functionality is more complex and cannot be adequately demonstrated within a code snippet, please refer to these two tutorials for more details about hyperparameter tuning (`linear <../auto_examples/plot_gridsearch_tutorial.html>`_
207 |     | -# and `deep learning <../tutorials/Parameter_Selection_for_Neural_Networks.html>`_).
    | 206 | +# Because this functionality is more complex and cannot be adequately demonstrated within a code snippet, please refer to these two tutorials for more details about hyperparameter tuning (`linear <../auto_examples/plot_linear_gridsearch_tutorial.html>`_ and `deep learning <../tutorials/Parameter_Selection_for_Neural_Networks.html>`_).
208 | 207 | # Another thing to consider is that hyperparameter search can be time-consuming, especially in the case of deep learning.
209 | 208 | # Users need to conduct this step with consideration of the available resources and time.
210 | 209 | #
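The core loop behind both referenced tuning tutorials can be sketched in library-agnostic form. The `train_eval` callback and the toy scoring function here are hypothetical stand-ins, not LibMultiLabel's API:

```python
from itertools import product

def grid_search(train_eval, grid):
    # exhaustive search: try every combination of hyperparameter values,
    # keep the configuration with the best validation score
    best_score, best_params = float("-inf"), None
    for values in product(*grid.values()):
        params = dict(zip(grid, values))
        score = train_eval(params)  # train a model, return validation score
        if score > best_score:
            best_score, best_params = score, params
    return best_params, best_score

# toy scoring function standing in for "train and evaluate":
# it prefers C=1.0 and bigram features
toy_score = lambda p: -abs(p["C"] - 1.0) + (0.1 if p["ngram"] == 2 else 0.0)
best, score = grid_search(toy_score, {"C": [0.1, 1.0, 10.0], "ngram": [1, 2]})
```

Each call to `train_eval` is a full training run, which is why the grid size must be chosen with the available time and resources in mind, particularly for deep learning.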
|
|
214 | 213 | # To use as much information as possible, for linear methods, after determining the best hyperparameters, all available data are generally trained under these optimal hyperparameters to obtain the final model.
215 | 214 | # We refer to this as the "retrain" strategy.
216 | 215 | #
217 |     | -# For linear methods, the `tutorial <../auto_examples/plot_gridsearch_tutorial.html>`__ for hyperparameter search already handles retraining by default.
    | 216 | +# For linear methods, the `tutorial <../auto_examples/plot_linear_gridsearch_tutorial.html>`_ for hyperparameter search already handles retraining by default.
218 | 217 | # As for deep learning, since this additional step is not common in practice, we include it in the last section of this `tutorial <../tutorials/Parameter_Selection_for_Neural_Networks.html>`__.
219 | 218 | #
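The "retrain" strategy described above reduces to one extra fitting step. The `retrain` helper and the toy `fit` function below are hypothetical illustrations, not part of LibMultiLabel:

```python
def retrain(train_split, val_split, best_params, fit):
    # after hyperparameter search, fit the final model on ALL labeled data
    # (training plus validation splits) instead of the training split alone
    return fit(train_split + val_split, best_params)

# toy `fit`: the "model" is just the data size paired with the hyperparameters,
# enough to show that the final fit sees the combined data
model = retrain([1, 2, 3], [4, 5], {"C": 1.0}, lambda data, p: (len(data), p))
```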
|
220 | 219 | # Step 6. Prediction
|
|