diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4e8f4f28..29baf6db 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -5,7 +5,6 @@ on: push: branches: - draft - - draft-2.0 - master jobs: deploy: diff --git a/CHANGELOG.md b/CHANGELOG.md index 407447dc..f0b68043 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `filter_vector` - `flatten_dimensions` - `load_geojson` + - `load_ml_model` - `load_url` + - `ml_fit_class_random_forest` + - `ml_fit_regr_random_forest` + - `ml_predict` + - `save_ml_model` - `unflatten_dimension` - `vector_buffer` - `vector_reproject` diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json index 347df234..83ce72ba 100644 --- a/meta/subtype-schemas.json +++ b/meta/subtype-schemas.json @@ -232,6 +232,12 @@ } } }, + "ml-model": { + "type": "object", + "subtype": "ml-model", + "title": "Machine Learning Model", + "description": "A machine learning model, accompanied with STAC metadata that implements the the STAC ml-model extension." + }, "output-format": { "type": "string", "subtype": "output-format", diff --git a/proposals/aggregate_spatial_window.json b/proposals/aggregate_spatial_window.json index 747db151..9e5dce4a 100644 --- a/proposals/aggregate_spatial_window.json +++ b/proposals/aggregate_spatial_window.json @@ -75,7 +75,7 @@ }, { "name": "boundary", - "description": "Behavior to apply if the number of values for the axes `x` and `y` is not a multiple of the corresponding value in the `size` parameter. Options are:\n\n- `pad` (default): pad the data cube with the no-data value `null` to fit the required window size.\n\n- `trim`: trim the data cube to fit the required window size.\n\nSet the parameter `align` to specifies to which corner the data is aligned to.", + "description": "Behavior to apply if the number of values for the axes `x` and `y` is not a multiple of the corresponding value in the `size` parameter. Options are:\n\n- `pad` (default): pad the data cube with the no-data value `null` to fit the required window size.\n\n- `trim`: trim the data cube to fit the required window size.\n\nUse the parameter `align` to align the data to the desired corner.", "schema": { "type": "string", "enum": [ diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json new file mode 100644 index 00000000..7fa86d89 --- /dev/null +++ b/proposals/load_ml_model.json @@ -0,0 +1,46 @@ +{ + "id": "load_ml_model", + "summary": "Load a ML model", + "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.", + "categories": [ + "machine learning", + "import" + ], + "experimental": true, + "parameters": [ + { + "name": "uri", + "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.", + "schema": [ + { + "title": "URL", + "type": "string", + "format": "uri", + "subtype": "uri", + "pattern": "^https?://" + }, + { + "title": "User-uploaded File", + "type": "string", + "subtype": "file-path", + "pattern": "^[^\r\n\\:'\"]+$" + } + ] + } + ], + "returns": { + "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://github.com/stac-extensions/ml-model", + "title": "STAC ml-model extension", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/ml_fit_class_random_forest.json new file mode 100644 index 00000000..63da48a1 --- /dev/null +++ b/proposals/ml_fit_class_random_forest.json @@ -0,0 +1,110 @@ +{ + "id": "ml_fit_class_random_forest", + "summary": "Train a random forest classification model", + "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + } + ] + }, + { + "name": "target", + "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "max_variables", + "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. This is often the default for classification.", + "schema": [ + { + "type": "integer", + "minimum": 1 + }, + { + "type": "string", + "enum": [ + "all", + "log2", + "onethird", + "sqrt" + ] + } + ] + }, + { + "name": "num_trees", + "description": "The number of trees build within the Random Forest classification.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1023/A:1010933404324", + "title": "Breiman (2001): Random Forests", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/ml_fit_regr_random_forest.json new file mode 100644 index 00000000..39207324 --- /dev/null +++ b/proposals/ml_fit_regr_random_forest.json @@ -0,0 +1,110 @@ +{ + "id": "ml_fit_regr_random_forest", + "summary": "Train a random forest regression model", + "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "predictors", + "description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.", + "schema": [ + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "bands" + } + ] + }, + { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + }, + { + "type": "other" + } + ] + } + ] + }, + { + "name": "target", + "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to associated with a value to predict (e.g. fractional forest canopy cover).", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "geometry" + } + ] + } + }, + { + "name": "max_variables", + "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.", + "schema": [ + { + "type": "integer", + "minimum": 1 + }, + { + "type": "string", + "enum": [ + "all", + "log2", + "onethird", + "sqrt" + ] + } + ] + }, + { + "name": "num_trees", + "description": "The number of trees build within the Random Forest regression.", + "optional": true, + "default": 100, + "schema": { + "type": "integer", + "minimum": 1 + } + }, + { + "name": "seed", + "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.", + "optional": true, + "default": null, + "schema": { + "type": [ + "integer", + "null" + ] + } + } + ], + "returns": { + "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + "links": [ + { + "href": "https://doi.org/10.1023/A:1010933404324", + "title": "Breiman (2001): Random Forests", + "type": "text/html", + "rel": "about" + } + ] +} diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json new file mode 100644 index 00000000..87cd2500 --- /dev/null +++ b/proposals/ml_predict.json @@ -0,0 +1,49 @@ +{ + "id": "ml_predict", + "summary": "Predict using ML", + "description": "Applies a machine learning model to a data cube of input features and returns the predicted values.", + "categories": [ + "machine learning" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The data cube containing the input features.", + "schema": { + "type": "object", + "subtype": "datacube" + } + }, + { + "name": "model", + "description": "A ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "dimensions", + "description": "Zero or more dimensions that will be reduced by the model. Fails with a `DimensionNotAvailable` exception if one of the specified dimensions does not exist.", + "schema": { + "type": "array", + "items": { + "type": "string" + } + } + } + ], + "returns": { + "description": "A data cube with the predicted values. It removes the specified dimensions and adds new dimension for the predicted values. It has the name `predictions` and is of type `other`. If a single value is returned, the dimension has a single label with name `0`.", + "schema": { + "type": "object", + "subtype": "datacube", + "dimensions": [ + { + "type": "other" + } + ] + } + } +} diff --git a/proposals/predict_curve.json b/proposals/predict_curve.json index 479b7fec..c4d78d99 100644 --- a/proposals/predict_curve.json +++ b/proposals/predict_curve.json @@ -1,7 +1,7 @@ { "id": "predict_curve", - "summary": "Predict values", - "description": "Predict values using a model function and pre-computed parameters. The process is intended to compute values for new labels.", + "summary": "Predict values using a model function", + "description": "Predict values using a model function and pre-computed parameters. The process is primarily intended to compute values for new labels, but it can also fill gaps where existing labels contain no-data (`null`) values.", "categories": [ "cubes", "math" diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json new file mode 100644 index 00000000..5e9ea8b0 --- /dev/null +++ b/proposals/save_ml_model.json @@ -0,0 +1,44 @@ +{ + "id": "save_ml_model", + "summary": "Save a ML model", + "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).", + "categories": [ + "machine learning", + "import" + ], + "experimental": true, + "parameters": [ + { + "name": "data", + "description": "The data to store as a machine learning model.", + "schema": { + "type": "object", + "subtype": "ml-model" + } + }, + { + "name": "options", + "description": "Additional parameters to create the file(s).", + "schema": { + "type": "object", + "additionalParameters": false + }, + "default": {}, + "optional": true + } + ], + "returns": { + "description": "Returns `false` if the process failed to store the model, `true` otherwise.", + "schema": { + "type": "boolean" + } + }, + "links": [ + { + "href": "https://github.com/stac-extensions/ml-model", + "title": "STAC ml-model extension", + "type": "text/html", + "rel": "about" + } + ] +} \ No newline at end of file diff --git a/tests/docs.html b/tests/docs.html index 23ef98e4..04b1c192 100644 --- a/tests/docs.html +++ b/tests/docs.html @@ -113,8 +113,8 @@ props: { document: 'processes.json', categorize: true, - apiVersion: '1.1.0', - title: 'openEO processes (draft)', + apiVersion: '1.2.0', + title: 'openEO processes (2.0.0-rc.1)', notice: '**Note:** This is the list of all processes specified by the openEO project. Back-ends implement a varying set of processes. Thus, the processes you can use at a specific back-end may derive from the specification, may include non-standardized processes and may not implement all processes listed here. Please check each back-end individually for the processes they support. The client libraries usually have a function called `listProcesses` or `list_processes` for that.' } }) diff --git a/tests/package.json b/tests/package.json index 15f6d2e8..1da8693f 100644 --- a/tests/package.json +++ b/tests/package.json @@ -1,6 +1,6 @@ { "name": "@openeo/processes", - "version": "2.0.0", + "version": "2.0.0-rc.1", "author": "openEO Consortium", "contributors": [ {