From d8eb3213a49e7137ccd9854a526a2faccb5f3006 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Tue, 8 Jul 2025 23:30:39 +0100 Subject: [PATCH 01/14] adding percentile rank docs Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 285 ++++++++++++++++++++--- 1 file changed, 257 insertions(+), 28 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 44562187b3f..d5fca212807 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -9,20 +9,112 @@ redirect_from: # Percentile rank aggregations -Percentile rank is the percentile of values at or below a threshold grouped by a specified value. For example, if a value is greater than or equal to 80% of the values, it has a percentile rank of 80. +The `percentile_ranks` aggregation estimates the percentage of observed values that fall below or at given thresholds. This is useful for understanding the relative standing of a particular value within a distribution of values. + +For example, if you want to know how a transaction amount of `45` compares to other transaction values in a dataset, a percentile rank aggregation will return a value like `82.3`, which means 82.3% of transactions were less than or equal to `45`. + + +## Examples + +See following examples covering multiple approaches to using `percentile_ranks`. 
+ +### Add sample data + +First, create a test index: + +```json +PUT /transaction_data +{ + "mappings": { + "properties": { + "amount": { + "type": "double" + } + } + } +} +``` +{% include copy-curl.html %} + +Add sample numeric values to illustrate percentile rank calculations: + +```json +POST /transaction_data/_bulk +{ "index": {} } +{ "amount": 10 } +{ "index": {} } +{ "amount": 20 } +{ "index": {} } +{ "amount": 30 } +{ "index": {} } +{ "amount": 40 } +{ "index": {} } +{ "amount": 50 } +{ "index": {} } +{ "amount": 60 } +{ "index": {} } +{ "amount": 70 } +``` +{% include copy-curl.html %} + +### Basic percentile rank aggregation + +Run a `percentile_ranks` aggregation to calculate how certain values compare to the overall distribution: + +```json +GET /transaction_data/_search +{ + "size": 0, + "aggs": { + "rank_check": { + "percentile_ranks": { + "field": "amount", + "values": [25, 55] + } + } + } +} +``` +{% include copy-curl.html %} + +The response demonstrates that 28.6% of the values are less than or equal to `25`, and 71.4% are less than or equal to `55`. + +```json +{ + ... + "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "rank_check": { + "values": { + "25.0": 28.57142857142857, + "55.0": 71.42857142857143 + } + } + } +} +``` + +### Keyed response + +You can change the format of the aggregation response by setting the `keyed` parameter to `false`: ```json -GET opensearch_dashboards_sample_data_ecommerce/_search +GET /transaction_data/_search { "size": 0, "aggs": { - "percentile_rank_taxful_total_price": { + "rank_check": { "percentile_ranks": { - "field": "taxful_total_price", - "values": [ - 10, - 15 - ] + "field": "amount", + "values": [25, 55], + "keyed": false } } } @@ -30,43 +122,180 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +The response includes an array instead of an object: ```json -... 
-"aggregations" : { - "percentile_rank_taxful_total_price" : { - "values" : { - "10.0" : 0.055096056411283456, - "15.0" : 0.0830092961834656 +{ + ... + "hits": { + "total": { + "value": 7, + "relation": "eq" + }, + "max_score": null, + "hits": [] + }, + "aggregations": { + "rank_check": { + "values": [ + { + "key": 25, + "value": 28.57142857142857 + }, + { + "key": 55, + "value": 71.42857142857143 + } + ] } } - } } ``` -This response indicates that the value `10` is at the `5.5`th percentile and the value `15` is at the `8.3`rd percentile. +## Precision tuning with compression -As with the `percentiles` aggregation, you can control the level of approximation by setting the optional `tdigest.compression` field. A larger value increases the precision of the approximation but uses more heap space. The default value is 100. +Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. -For example, use the following request to set `compression` to `200`: +The following example is configured with `tdigest.compression` set to `200`: ```json -GET opensearch_dashboards_sample_data_ecommerce/_search +GET /transaction_data/_search { "size": 0, "aggs": { - "percentile_rank_taxful_total_price": { + "rank_check": { "percentile_ranks": { - "field": "taxful_total_price", - "values": [ - 10, - 15 - ], - "tdigest": { + "field": "amount", + "values": [25, 55], + "tdigest": { "compression": 200 } } } } -} \ No newline at end of file +} +``` +{% include copy-curl.html %} + +### HDR histogram + +As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. + +You should use HDR if you: + +* Are aggregating across many buckets. +* Don't require extreme precision in the tail percentiles. 
+* Have sufficient memory available. + +You should avoid HDR if: + +* Tail accuracy is important. +* You're analyzing skewed or sparse data distributions. + +The following example is configured with `hdr.number_of_significant_value_digits` set to `3`: + +```json +GET /transaction_data/_search +{ + "size": 0, + "aggs": { + "rank_check": { + "percentile_ranks": { + "field": "amount", + "values": [25, 55], + "hdr": { + "number_of_significant_value_digits": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Missing values + +If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. This ensures that documents without an amount field will be treated as if the value were `0`, and included in the percentile ranks computation. See following example: + +```json +GET /transaction_data/_search +{ + "size": 0, + "aggs": { + "rank_check": { + "percentile_ranks": { + "field": "amount", + "values": [25, 55], + "missing": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +## Script + +Instead of specifying a field, you can dynamically compute the value using a script. This is useful when you need to apply transformations, such as converting currencies or applying weights. + +### Inline script + +The following example uses inline script to calculate the percentile ranks of the transformed values `30` and `60`, against values from the amount field multiplied by 10%: + +```json +GET /transaction_data/_search +{ + "size": 0, + "aggs": { + "rank_check": { + "percentile_ranks": { + "values": [30, 60], + "script": { + "source": "doc['amount'].value * 1.1" + } + } + } + } +} +``` +{% include copy-curl.html %} + +## Stored script + +Stored scripts can also be used. 
+ +To use a stored script first create it using the following command: + +```json +POST _scripts/percentile_script +{ + "script": { + "lang": "painless", + "source": "doc[params.field].value * params.multiplier" + } +} +``` +{% include copy-curl.html %} + +Use the stored script in the `percentile_ranks` aggregation: + +```json +GET /transaction_data/_search +{ + "size": 0, + "aggs": { + "rank_check": { + "percentile_ranks": { + "values": [30, 60], + "script": { + "id": "percentile_script", + "params": { + "field": "amount", + "multiplier": 1.1 + } + } + } + } + } +} +``` From 399dffcecaef261fd09458ef843318405bec06bf Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Wed, 9 Jul 2025 14:28:02 +0100 Subject: [PATCH 02/14] Update percentile-ranks.md Signed-off-by: AntonEliatra --- _aggregations/metric/percentile-ranks.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index d5fca212807..0a365a445ed 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -152,7 +152,7 @@ The response includes an array instead of an object: } ``` -## Precision tuning with compression +### Precision tuning with compression Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. @@ -213,9 +213,9 @@ GET /transaction_data/_search ``` {% include copy-curl.html %} -## Missing values +### Missing values -If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. This ensures that documents without an amount field will be treated as if the value were `0`, and included in the percentile ranks computation. 
See following example: +If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. The following example ensures that documents without an amount field will be treated as if the value were `0`, and included in the percentile ranks computation: ```json GET /transaction_data/_search @@ -234,11 +234,11 @@ GET /transaction_data/_search ``` {% include copy-curl.html %} -## Script +### Script Instead of specifying a field, you can dynamically compute the value using a script. This is useful when you need to apply transformations, such as converting currencies or applying weights. -### Inline script +#### Inline script The following example uses inline script to calculate the percentile ranks of the transformed values `30` and `60`, against values from the amount field multiplied by 10%: @@ -260,7 +260,7 @@ GET /transaction_data/_search ``` {% include copy-curl.html %} -## Stored script +#### Stored script Stored scripts can also be used. From 11b93958c311c186999e828fe3ef3a9004390a52 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 9 Jul 2025 21:56:26 +0100 Subject: [PATCH 03/14] adding percentile ranks aggs docs and links Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 10 +- _aggregations/metric/percentile.md | 347 ++++++++++++++++++++--- 2 files changed, 320 insertions(+), 37 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 0a365a445ed..cca3c7b37de 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -20,7 +20,7 @@ See following examples covering multiple approaches to using `percentile_ranks`. 
### Add sample data -First, create a test index: +First, create a sample index: ```json PUT /transaction_data @@ -57,7 +57,7 @@ POST /transaction_data/_bulk ``` {% include copy-curl.html %} -### Basic percentile rank aggregation +### Percentile rank aggregation Run a `percentile_ranks` aggregation to calculate how certain values compare to the overall distribution: @@ -152,9 +152,9 @@ The response includes an array instead of an object: } ``` -### Precision tuning with compression +### Precision tuning with tdigest -Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. +Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. For more information on how tdigest works see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) The following example is configured with `tdigest.compression` set to `200`: @@ -179,7 +179,7 @@ GET /transaction_data/_search ### HDR histogram -As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. +As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. 
For further details regarding how the HDR histogram works, see [HDR histogram]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#hdr-histogram). You should use HDR if you: diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index d9168e4539b..30e3d950cbe 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -9,22 +9,109 @@ redirect_from: # Percentile aggregations -Percentile is the percentage of the data that's at or below a certain threshold value. +The `percentiles` aggregation estimates the value at a given percentile of a numeric field. This is useful for understanding distribution boundaries. -The `percentile` metric is a multi-value metric aggregation that lets you find outliers in your data or figure out the distribution of your data. +For example, a 95th percentile of `load_time` = `120ms` means 95% of values are less than or equal to 120ms. -Like the `cardinality` metric, the `percentile` metric is also approximate. +Similarly to the [`cardinality`]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) metric, the `percentile` metric is also approximate. -The following example calculates the percentile in relation to the `taxful_total_price` field: +## Examples + +The following examples demonstrate different configurations of the `percentiles` aggregation. 
+ +### Create an index and add sample data + +First, create an index: + +```json +PUT /latency_data +{ + "mappings": { + "properties": { + "load_time": { + "type": "double" + } + } + } +} +``` +{% include copy-curl.html %} + +Add sample numeric values to illustrate percentile calculations: + +```json +POST /latency_data/_bulk +{ "index": {} } +{ "load_time": 20 } +{ "index": {} } +{ "load_time": 40 } +{ "index": {} } +{ "load_time": 60 } +{ "index": {} } +{ "load_time": 80 } +{ "index": {} } +{ "load_time": 100 } +{ "index": {} } +{ "load_time": 120 } +{ "index": {} } +{ "load_time": 140 } +``` + +{% include copy-curl.html %} + +### Percentiles aggregation + +The following example calculates the default set of percentiles for the `load_time` field: + +```json +GET /latency_data/_search +{ + "size": 0, + "aggs": { + "load_time_percentiles": { + "percentiles": { + "field": "load_time" + } + } + } +} +``` +{% include copy-curl.html %} + +By default, the 1st, 5th, 25th, 50th, 75th, 95th, and 99th percentiles are returned: + +```json +{ + ... + "aggregations": { + "load_time_percentiles": { + "values": { + "1.0": 20, + "5.0": 20, + "25.0": 40, + "50.0": 80, + "75.0": 120, + "95.0": 140, + "99.0": 140 + } + } + } +} +``` + +### Custom percentiles + +You can specify the exact percentiles using the `percents` setting: ```json -GET opensearch_dashboards_sample_data_ecommerce/_search +GET /latency_data/_search { "size": 0, "aggs": { - "percentile_taxful_total_price": { + "load_time_percentiles": { "percentiles": { - "field": "taxful_total_price" + "field": "load_time", + "percents": [50, 90, 99] } } } @@ -32,39 +119,118 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -#### Example response +The response includes only the three requested percentiles: ```json -... 
-"aggregations" : { - "percentile_taxful_total_price" : { - "values" : { - "1.0" : 21.984375, - "5.0" : 27.984375, - "25.0" : 44.96875, - "50.0" : 64.22061688311689, - "75.0" : 93.0, - "95.0" : 156.0, - "99.0" : 222.0 +{ + ... + "aggregations": { + "load_time_percentiles": { + "values": { + "50.0": 80, + "90.0": 140, + "99.0": 140 + } } } - } } ``` -You can control the level of approximation using the optional `tdigest.compression` field. A larger value indicates that the data structure that approximates percentiles is more accurate but uses more heap space. The default value is 100. +### Keyed response + +You can set `keyed: false` to return results as an array: + +```json +GET /latency_data/_search +{ + "size": 0, + "aggs": { + "load_time_percentiles": { + "percentiles": { + "field": "load_time", + "keyed": false + } + } + } +} +``` +{% include copy-curl.html %} + +The response provides percentiles as an array of values: + +```json +{ + ... + "aggregations": { + "load_time_percentiles": { + "values": [ + { + "key": 1, + "value": 20 + }, + { + "key": 5, + "value": 20 + }, + { + "key": 25, + "value": 40 + }, + { + "key": 50, + "value": 80 + }, + { + "key": 75, + "value": 120 + }, + { + "key": 95, + "value": 140 + }, + { + "key": 99, + "value": 140 + } + ] + } + } +} +``` + +### Precision tuning with tdigest + +The `tdigest` algorithm is the default method used to calculate percentiles. It provides a memory-efficient way to estimate percentile ranks, especially when working with floating-point data such as response times or latencies. + +Unlike exact percentile calculations, `tdigest` uses a probabilistic approach that groups values into centroids—small clusters that summarize the distribution. This method enables accurate estimates for most percentiles without needing to store all the raw data in memory. 
-For example, use the following request to set `compression` to `200`: +The algorithm is designed to be highly accurate near the tails of the distribution, the low percentiles (1st) and high percentiles (99th), which are often the most important for performance analysis. You can control the precision of the results using the `compression` parameter. + +A higher compression value means more centroids are used, increasing accuracy, especially in the tails, but requiring more memory and CPU. A lower compression value reduces memory usage and speeds up execution, but results may be less accurate. + +#### When to use tdigest + +Use tdigest when: + +* Your data includes floating-point values, such as response times, latency, or duration. +* You need accurate results in the extreme percentiles, for example 1st, 99th. + +Avoid tdigest if: + +* You are working only with integer data and want maximum speed. +* You care less about accuracy in the distribution tails and prefer faster aggregation (consider using [`hdr`](#hdr-histogram) instead). + + The following example sets `tdigest.compression` to `200`: ```json -GET opensearch_dashboards_sample_data_ecommerce/_search +GET /latency_data/_search { "size": 0, "aggs": { - "percentile_taxful_total_price": { + "load_time_percentiles": { "percentiles": { - "field": "taxful_total_price", - "tdigest": { + "field": "load_time", + "tdigest": { "compression": 200 } } @@ -72,18 +238,98 @@ GET opensearch_dashboards_sample_data_ecommerce/_search } } ``` +{% include copy-curl.html %} + +### HDR histogram + +HDR (High Dynamic Range) histogram is an alternative to [`tdigest`](#precision-tuning-with-tdigest) for calculating percentiles. It is especially useful when dealing with large datasets and latency measurements. It is designed for speed and supports a wide dynamic range of values while maintaining a fixed, configurable level of precision. 
+ +Unlike [`tdigest`](#precision-tuning-with-tdigest), which offers more accuracy in the tails of a distribution (extreme percentiles), HDR prioritizes speed and uniform accuracy across the range. It works best when the number of buckets is large and extreme precision in rare values is not required. + +For example, if you're measuring response times ranging from 1 microsecond to 1 hour and configure HDR with 3 significant digits, it will record values with a precision of ±1 microsecond for values up to 1 millisecond and ±3.6 seconds for values near 1 hour. + +This trade-off makes HDR much faster but more memory-intensive than [`tdigest`](#precision-tuning-with-tdigest). + +Breakdown of HDR significant digits: + +| Significant Digits | Relative Precision (Max Error) | +| ------------------ | ------------------------------ | +| 1 | 1 part in 10 = 10% | +| 2 | 1 part in 100 = 1% | +| 3 | 1 part in 1,000 = 0.1% | +| 4 | 1 part in 10,000 = 0.01% | +| 5 | 1 part in 100,000 = 0.001% | + +You should use HDR if you: + +* Are aggregating across many buckets. +* Don't require extreme precision in the tail percentiles. +* Have sufficient memory available. + +You should avoid HDR if: + +* Tail accuracy is important. +* You are analyzing skewed or sparse data distributions. + +The following example is configured with `hdr.number_of_significant_value_digits` set to `3`: + +```json +GET /latency_data/_search +{ + "size": 0, + "aggs": { + "load_time_percentiles": { + "percentiles": { + "field": "load_time", + "hdr": { + "number_of_significant_value_digits": 3 + } + } + } + } +} +``` +{% include copy-curl.html %} + +### Missing values + +You can use the `missing` setting to configure a fallback value for documents that do not have the target field. 
See the following example: + +```json +GET /latency_data/_search +{ + "size": 0, + "aggs": { + "load_time_percentiles": { + "percentiles": { + "field": "load_time", + "missing": 0 + } + } + } +} +``` +{% include copy-curl.html %} + +### Script + +Instead of specifying a field, you can dynamically compute the value using a script. This is useful when you need to apply transformations, such as converting currencies or applying weights. -The default percentiles returned are `1, 5, 25, 50, 75, 95, 99`. You can specify other percentiles in the optional `percents` field. For example, to get the 99.9th and 99.99th percentiles, run the following request: +#### Inline script + +Use a script to compute derived values: ```json -GET opensearch_dashboards_sample_data_ecommerce/_search +GET /latency_data/_search { "size": 0, "aggs": { - "percentile_taxful_total_price": { + "adjusted_percentiles": { "percentiles": { - "field": "taxful_total_price", - "percents": [99.9, 99.99] + "script": { + "source": "doc['load_time'].value * 1.2" + }, + "percents": [50, 95] } } } @@ -91,4 +337,41 @@ GET opensearch_dashboards_sample_data_ecommerce/_search ``` {% include copy-curl.html %} -The specified value overrides the default percentiles, so only the percentiles you specify are returned. +#### Stored script + +Stored scripts can also be used. 
+ +First, create a sample script using the following command: + +```json +POST _scripts/load_script +{ + "script": { + "lang": "painless", + "source": "doc[params.field].value * params.multiplier" + } +} +``` + +Use the stored script in the `percentiles` aggregation, providing the `params` used by the stored script: + +```json +GET /latency_data/_search +{ + "size": 0, + "aggs": { + "adjusted_percentiles": { + "percentiles": { + "script": { + "id": "load_script", + "params": { + "field": "load_time", + "multiplier": 1.2 + } + }, + "percents": [50, 95] + } + } + } +} +``` From 287b5140af50bee10b051222ee34d94b5c00673d Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 17 Jul 2025 10:43:36 +0100 Subject: [PATCH 04/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 14 ++++++++++++++ _aggregations/metric/percentile.md | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index cca3c7b37de..5252a0b2f1f 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -13,6 +13,20 @@ The `percentile_ranks` aggregation estimates the percentage of observed values t For example, if you want to know how a transaction amount of `45` compares to other transaction values in a dataset, a percentile rank aggregation will return a value like `82.3`, which means 82.3% of transactions were less than or equal to `45`. +## Parameters + +The `percentile_ranks` aggregation takes the following parameters: + +| Parameter | Data type | Required/Optional | Description | +| ---------------------------------------- | ---------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `field` | String | Required | The numeric field to compute percentile ranks on. 
| +| `values` | Array of doubles | Required | The values for which to calculate percentile ranks. | +| `keyed` | Boolean | Optional | If set to `false`, returns results as an array. Otherwise returns results as a JSON object. Default is `true`. | +| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [precision tuning with tdigest](#precision-tuning-with-tdigest). | +| `hdr.number_of_significant_value_digits` | Integer | Optional | The precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | +| `missing` | Number | Optional | The default value to use when the target field is missing in a document. | +| `script` | Object | Optional | The script to compute custom values instead of using a field. Supports inline or stored scripts. | + ## Examples diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 30e3d950cbe..dc8edc81577 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -15,6 +15,20 @@ For example, a 95th percentile of `load_time` = `120ms` means 95% of values are Similarly to the [`cardinality`]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) metric, the `percentile` metric is also approximate. +## Parameters + +The `percentiles` aggregation takes the following parameters: + +| Parameter | Data type | Required/Optional | Description | +| ---------------------------------------- | ---------------- | -------- | --------------------------------------------------------------------------------------------------------------------------- | +| `field` | String | Required | The numeric field to compute percentiles on. | +| `percents` | Array of doubles | Optional | The list of percentiles to calculate. Default is `[1, 5, 25, 50, 75, 95, 99]`. | +| `keyed` | Boolean | Optional | If set to `false` returns results as a array, otherwise returns results as JSON object. 
Default is `true` | +| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. For further details see [precision tuning with tdigest](#precision-tuning-with-tdigest). | +| `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. For further details see [HDR histogram](#hdr-histogram) | +| `missing` | Number | Optional | Default value for documents missing the field. | +| `script` | Object | Optional | Script to compute custom values instead of using a field. Supports inline or stored scripts. | + ## Examples The following examples demonstrate different configurations of the `percentiles` aggregation. From 340750d15a6d3c921db4ea6a40690ebc8f3d2b3e Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 17 Jul 2025 10:48:03 +0100 Subject: [PATCH 05/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 8 ++++---- _aggregations/metric/percentile.md | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 5252a0b2f1f..f0f8085a74b 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -19,13 +19,13 @@ The `percentile_ranks` aggregation takes the following parameters: | Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `field` | String | Required | The numeric field to compute percentile ranks on. | -| `values` | Array of doubles | Required | The values for which to calculate percentile ranks. | +| `field` | String | Required | The numeric field used to compute percentile ranks on. 
| +| `values` | Array of doubles | Required | The values used to calculate percentile ranks. | | `keyed` | Boolean | Optional | If set to `false`, returns results as an array. Otherwise returns results as a JSON object. Default is `true`. | | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [precision tuning with tdigest](#precision-tuning-with-tdigest). | | `hdr.number_of_significant_value_digits` | Integer | Optional | The precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | -| `missing` | Number | Optional | The default value to use when the target field is missing in a document. | -| `script` | Object | Optional | The script to compute custom values instead of using a field. Supports inline or stored scripts. | +| `missing` | Number | Optional | The default value used when the target field is missing in a document. | +| `script` | Object | Optional | The script used to compute custom values instead of using a field. Supports inline or stored scripts. | ## Examples diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index dc8edc81577..97f5dde3fa1 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -21,8 +21,8 @@ The `percentiles` aggregation takes the following parameters: | Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | -------- | --------------------------------------------------------------------------------------------------------------------------- | -| `field` | String | Required | The numeric field to compute percentiles on. | -| `percents` | Array of doubles | Optional | The list of percentiles to calculate. Default is `[1, 5, 25, 50, 75, 95, 99]`. | +| `field` | String | Required | The numeric field used to compute percentiles on. 
| +| `percents` | Array of doubles | Optional | The list of percentiles returned in the response. Default is `[1, 5, 25, 50, 75, 95, 99]`. | | `keyed` | Boolean | Optional | If set to `false` returns results as a array, otherwise returns results as JSON object. Default is `true` | | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. For further details see [precision tuning with tdigest](#precision-tuning-with-tdigest). | | `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. For further details see [HDR histogram](#hdr-histogram) | From e1f58625af2cc01f1216fa70bbef91d372b4b081 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 17 Jul 2025 10:52:05 +0100 Subject: [PATCH 06/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 97f5dde3fa1..db20af10a66 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -26,7 +26,7 @@ The `percentiles` aggregation takes the following parameters: | `keyed` | Boolean | Optional | If set to `false` returns results as a array, otherwise returns results as JSON object. Default is `true` | | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. For further details see [precision tuning with tdigest](#precision-tuning-with-tdigest). | | `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. For further details see [HDR histogram](#hdr-histogram) | -| `missing` | Number | Optional | Default value for documents missing the field. | +| `missing` | Number | Optional | Default value used for documents missing the field. | | `script` | Object | Optional | Script to compute custom values instead of using a field. 
Supports inline or stored scripts. | ## Examples From 809e3049bf11736eac3cdfd4b191aee6d7f15244 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 17 Jul 2025 10:52:22 +0100 Subject: [PATCH 07/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index db20af10a66..20e557d57d9 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -27,7 +27,7 @@ The `percentiles` aggregation takes the following parameters: | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. For further details see [precision tuning with tdigest](#precision-tuning-with-tdigest). | | `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. For further details see [HDR histogram](#hdr-histogram) | | `missing` | Number | Optional | Default value used for documents missing the field. | -| `script` | Object | Optional | Script to compute custom values instead of using a field. Supports inline or stored scripts. | +| `script` | Object | Optional | Script used to compute custom values instead of using a field. Supports inline or stored scripts. 
| ## Examples From 0fddd06f72c5c1a27640e4c7f5e3010ab42e8379 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Thu, 17 Jul 2025 10:56:09 +0100 Subject: [PATCH 08/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index f0f8085a74b..5d41c0e5ab9 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -168,7 +168,7 @@ The response includes an array instead of an object: ### Precision tuning with tdigest -Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. For more information on how tdigest works see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) +Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. 
For more information about how tdigest works see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) The following example is configured with `tdigest.compression` set to `200`: From 2a9d069a592e2d232853783102d29c870ebb5303 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Thu, 17 Jul 2025 18:42:28 +0100 Subject: [PATCH 09/14] Apply suggestions from code review Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: AntonEliatra --- _aggregations/metric/percentile-ranks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 5d41c0e5ab9..485bd7779dd 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -15,7 +15,7 @@ For example, if you want to know how a transaction amount of `45` compares to ot ## Parameters -The `percentile_ranks` aggregation takes the following parameters: +The `percentile_ranks` aggregation takes the following parameters. 
| Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------- | From 5cd1f49fde83f8ac086bf308911e3686d70cc477 Mon Sep 17 00:00:00 2001 From: Anton Rubin Date: Wed, 23 Jul 2025 16:56:22 +0100 Subject: [PATCH 10/14] addressing PR comments Signed-off-by: Anton Rubin --- _aggregations/metric/percentile-ranks.md | 2 +- _aggregations/metric/percentile.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 485bd7779dd..8e9f6636a1a 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -117,7 +117,7 @@ The response demonstrates that 28.6% of the values are less than or equal to `25 ### Keyed response -You can change the format of the aggregation response by setting the `keyed` parameter to `false`: +You can change the format of the returned aggregation from json object to list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /transaction_data/_search diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 20e557d57d9..210c2ff6b73 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -152,7 +152,7 @@ The response includes only the three requested percentiles aggregations: ### Keyed response -You can set `keyed: false` to return results as an array: +You can change the format of the returned aggregation from JSON object to list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /latency_data/_search From 726bf4cbf12ec769f3308905c1e7dc004d1fe237 Mon Sep 17 00:00:00 2001 From: AntonEliatra Date: Mon, 28 Jul 2025 16:05:36 +0100 Subject: [PATCH 11/14] Apply suggestions from code review 
Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: AntonEliatra --- _aggregations/metric/percentile-ranks.md | 33 +++++++-------- _aggregations/metric/percentile.md | 54 ++++++++++++------------ 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 8e9f6636a1a..79145ee295f 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -11,7 +11,7 @@ redirect_from: The `percentile_ranks` aggregation estimates the percentage of observed values that fall below or at given thresholds. This is useful for understanding the relative standing of a particular value within a distribution of values. -For example, if you want to know how a transaction amount of `45` compares to other transaction values in a dataset, a percentile rank aggregation will return a value like `82.3`, which means 82.3% of transactions were less than or equal to `45`. +For example, you can use a percentile rank aggregation to learn how a transaction amount of `45` compares to other transaction values in a dataset. The percentile rank aggregation returns a value like `82.3`, which means 82.3% of transactions are less than or equal to `45`. ## Parameters @@ -22,17 +22,15 @@ The `percentile_ranks` aggregation takes the following parameters. | `field` | String | Required | The numeric field used to compute percentile ranks on. | | `values` | Array of doubles | Required | The values used to calculate percentile ranks. | | `keyed` | Boolean | Optional | If set to `false`, returns results as an array. Otherwise returns results as a JSON object. Default is `true`. | -| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [precision tuning with tdigest](#precision-tuning-with-tdigest). 
| +| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [Precision tuning with tdigest](#precision-tuning-with-tdigest). | | `hdr.number_of_significant_value_digits` | Integer | Optional | The precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | | `missing` | Number | Optional | The default value used when the target field is missing in a document. | -| `script` | Object | Optional | The script used to compute custom values instead of using a field. Supports inline or stored scripts. | +| `script` | Object | Optional | The script used to compute custom values instead of using a field. Supports inline and stored scripts. | -## Examples +## Example -See following examples covering multiple approaches to using `percentile_ranks`. -### Add sample data First, create a sample index: @@ -71,7 +69,6 @@ POST /transaction_data/_bulk ``` {% include copy-curl.html %} -### Percentile rank aggregation Run a `percentile_ranks` aggregation to calculate how certain values compare to the overall distribution: @@ -91,7 +88,7 @@ GET /transaction_data/_search ``` {% include copy-curl.html %} -The response demonstrates that 28.6% of the values are less than or equal to `25`, and 71.4% are less than or equal to `55`. 
+The response demonstrates that 28.6% of the values are less than or equal to `25` and 71.4% are less than or equal to `55`: ```json { @@ -115,9 +112,9 @@ The response demonstrates that 28.6% of the values are less than or equal to `25 } ``` -### Keyed response +## Keyed response -You can change the format of the returned aggregation from json object to list of key-value pairs by setting the `keyed` parameter to `false`: +You can change the format of the returned aggregation from JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /transaction_data/_search @@ -166,9 +163,9 @@ The response includes an array instead of an object: } ``` -### Precision tuning with tdigest +## Precision tuning with tdigest -Percentile ranks are calculated using the `tdigest` algorithm by default. You can control the trade-off between accuracy and memory usage by adjusting the `tdigest.compression` configuration. Higher values provide better accuracy, however require more memory. For more information about how tdigest works see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) +By default, percentile ranks are calculated using the `tdigest` algorithm. You can control the trade-off between accuracy and memory usage by specifying the `tdigest.compression` parameter. Higher values provide better accuracy but require more memory. For more information about how tdigest works, see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) The following example is configured with `tdigest.compression` set to `200`: @@ -193,7 +190,7 @@ GET /transaction_data/_search ### HDR histogram -As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. 
For further details regarding how HDR histogram works see [HDR histogram]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#hdr-histogram) +As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. For more information about how HDR histogram works, see [HDR histogram]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#hdr-histogram) You should use HDR if you: @@ -229,7 +226,7 @@ GET /transaction_data/_search ### Missing values -If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. The following example ensures that documents without an amount field will be treated as if the value were `0`, and included in the percentile ranks computation: +If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. The following example ensures that documents without an amount field are treated as if their values are `0` and are included in the percentile ranks computation: ```json GET /transaction_data/_search @@ -254,7 +251,7 @@ Instead of specifying a field, you can dynamically compute the value using a scr #### Inline script -The following example uses inline script to calculate the percentile ranks of the transformed values `30` and `60`, against values from the amount field multiplied by 10%: +The following example uses an inline script to calculate the percentile ranks of the transformed values `30` and `60` against values from the `amount` field, increased by 10%: ```json GET /transaction_data/_search @@ -276,9 +273,8 @@ GET /transaction_data/_search #### Stored script -Stored scripts can also be used. 
-To use a stored script first create it using the following command: +To use a stored script, first create it using the following request: ```json POST _scripts/percentile_script @@ -291,7 +287,7 @@ POST _scripts/percentile_script ``` {% include copy-curl.html %} -Use the stored script in the `percentile_ranks` aggregation: +Then use the stored script in the `percentile_ranks` aggregation: ```json GET /transaction_data/_search @@ -313,3 +309,4 @@ GET /transaction_data/_search } } ``` +{% include copy-curl.html %} diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 210c2ff6b73..c1423798c0c 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -13,27 +13,25 @@ The `percentiles` aggregation estimates the value at a given percentile of a num For example, a 95th percentile of `load_time` = `120ms` means 95% of values are less than or equal to 120ms. -Similarly to the [`cardinality`]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) metric, the `percentile` metric is also approximate. +Similarly to the [`cardinality`]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) metric, the `percentile` metric is approximate. ## Parameters -The `percentiles` aggregation takes the following parameters: +The `percentiles` aggregation takes the following parameters. | Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | -------- | --------------------------------------------------------------------------------------------------------------------------- | | `field` | String | Required | The numeric field used to compute percentiles on. | | `percents` | Array of doubles | Optional | The list of percentiles returned in the response. Default is `[1, 5, 25, 50, 75, 95, 99]`. | | `keyed` | Boolean | Optional | If set to `false` returns results as a array, otherwise returns results as JSON object. 
Default is `true` | -| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. For further details see [precision tuning with tdigest](#precision-tuning-with-tdigest). | -| `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. For further details see [HDR histogram](#hdr-histogram) | +| `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [Precision tuning with tdigest](#precision-tuning-with-tdigest). | +| `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | | `missing` | Number | Optional | Default value used for documents missing the field. | -| `script` | Object | Optional | Script used to compute custom values instead of using a field. Supports inline or stored scripts. | +| `script` | Object | Optional | Script used to compute custom values instead of using a field. Supports inline and stored scripts. | -## Examples +## Example -The following examples demonstrate different configurations of the `percentiles` aggregation. 
-### Create an index and add sample data First, create an index: @@ -75,7 +73,7 @@ POST /latency_data/_bulk ### Percentiles aggregation -The following example calculates the default set of percentiles for the `load_time` field" +The following example calculates the default set of percentiles for the `load_time` field: ```json GET /latency_data/_search @@ -113,9 +111,9 @@ By default, the 1st, 5th, 25th, 50th, 75th, 95th, and 99th percentiles are retur } ``` -### Custom percentiles +## Custom percentiles -You can specify the exact percentiles using `percents` setting: +You can specify the exact percentiles using `percents` array: ```json GET /latency_data/_search @@ -152,7 +150,7 @@ The response includes only the three requested percentiles aggregations: ### Keyed response -You can change the format of the returned aggregation from JSON object to list of key-value pairs by setting the `keyed` parameter to `false`: +You can change the format of the returned aggregation from JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /latency_data/_search @@ -216,20 +214,19 @@ The response provides percentiles as an array of values: The `tdigest` algorithm is the default method used to calculate percentiles. It provides a memory-efficient way to estimate percentile ranks, especially when working with floating-point data such as response times or latencies. -Unlike exact percentile calculations, `tdigest` uses a probabilistic approach that groups values into centroids—small clusters that summarize the distribution. This method enables accurate estimates for most percentiles without needing to store all the raw data in memory. +Unlike exact percentile calculations, `tdigest` uses a probabilistic approach that groups values into _centroids_---small clusters that summarize the distribution. This method enables accurate estimates for most percentiles without needing to store all the raw data in memory. 
-The algorithm is designed to be highly accurate near the tails of the distribution, the low percentiles (1st) and high percentiles (99th), which are often the most important for performance analysis. You can control the precision of the results using the `compression` parameter. +The algorithm is designed to be highly accurate near the tails of the distribution, the low percentiles (such as 1st) and high percentiles (such as 99th), which are often the most important for performance analysis. You can control the precision of the results using the `compression` parameter. -A higher compression value means more centroids are used, increasing accuracy, especially in the tails, but requiring more memory and CPU. A lower compression value reduces memory usage and speeds up execution, but results may be less accurate. +A higher `compression` value means that more centroids are used, increasing accuracy, especially in the tails, but requiring more memory and CPU. A lower `compression` value reduces memory usage and speeds up execution, but the results may be less accurate. -#### When to use tdigest -Use tdigest when: +Use `tdigest` when: * Your data includes floating-point values, such as response times, latency, or duration. -* You need accurate results in the extreme percentiles, for example 1st, 99th. +* You need accurate results in the extreme percentiles, for example 1st or 99th. -Avoid tdigest if: +Avoid `tdigest` when: * You are working only with integer data and want maximum speed. * You care less about accuracy in the distribution tails and prefer faster aggregation (consider using [`hdr`](#hdr-histogram) instead). @@ -264,9 +261,9 @@ For example, if you're measuring response times ranging from 1 microsecond to 1 This trade-off makes HDR much faster and more memory-intensive than [`tdigest`](#precision-tuning-with-tdigest) -Breakdown of HDR significant digits: +The following table presents the breakdown of HDR significant digits. 
-| Significant Digits | Relative Precision (Max Error) | +| Significant digits | Relative precision (max error) | | ------------------ | ------------------------------ | | 1 | 1 part in 10 = 10% | | 2 | 1 part in 100 = 1% | @@ -307,7 +304,7 @@ GET /latency_data/_search ### Missing values -You can use `missing` setting to configure fallback value for documents that do not have the target field. See following example: +Use the `missing` setting to configure a fallback value for documents that do not have the target field: ```json GET /latency_data/_search @@ -325,11 +322,11 @@ GET /latency_data/_search ``` {% include copy-curl.html %} -### Script +## Script Instead of specifying a field, you can dynamically compute the value using a script. This is useful when you need to apply transformations, such as converting currencies or applying weights. -#### Inline script +### Inline script Use a script to compute derived values: @@ -351,11 +348,10 @@ GET /latency_data/_search ``` {% include copy-curl.html %} -#### Stored script +### Stored script -Stored scripts can also be used. 
-First, create a sample script using the following command: +First, create a sample script using the following request: ```json POST _scripts/load_script @@ -366,8 +362,9 @@ POST _scripts/load_script } } ``` +{% include copy-curl.html %} -Use the stored script in the `percentiles` aggregation, providing the `params` used by the stored script: +Then use the stored script in the `percentiles` aggregation, providing the `params` required by the stored script: ```json GET /latency_data/_search @@ -389,3 +386,4 @@ GET /latency_data/_search } } ``` +{% include copy-curl.html %} From d4582c387febd63a9da36d1d043ca9e1f3377649 Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Mon, 28 Jul 2025 11:59:03 -0400 Subject: [PATCH 12/14] Update _aggregations/metric/percentile.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower --- _aggregations/metric/percentile.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index c1423798c0c..9ad40384a2e 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -363,6 +363,7 @@ POST _scripts/load_script } ``` {% include copy-curl.html %} +{% include copy-curl.html %} Then use the stored script in the `percentiles` aggregation, providing the `params` required by the stored script: From 9fcbdef09b5a1315125ece17589bce0589b53969 Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Mon, 28 Jul 2025 11:59:24 -0400 Subject: [PATCH 13/14] Update _aggregations/metric/percentile.md Co-authored-by: kolchfa-aws <105444904+kolchfa-aws@users.noreply.github.com> Signed-off-by: Nathan Bower --- _aggregations/metric/percentile.md | 1 + 1 file changed, 1 insertion(+) diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 9ad40384a2e..8249c48b6d4 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -388,3 +388,4 @@ GET 
/latency_data/_search } ``` {% include copy-curl.html %} +{% include copy-curl.html %} From 2682968f83e7057a527ef1367c49ac6218b91e5f Mon Sep 17 00:00:00 2001 From: Nathan Bower Date: Mon, 28 Jul 2025 12:01:08 -0400 Subject: [PATCH 14/14] Apply suggestions from code review Signed-off-by: Nathan Bower --- _aggregations/metric/percentile-ranks.md | 10 ++++---- _aggregations/metric/percentile.md | 30 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/_aggregations/metric/percentile-ranks.md b/_aggregations/metric/percentile-ranks.md index 79145ee295f..124192ecddf 100644 --- a/_aggregations/metric/percentile-ranks.md +++ b/_aggregations/metric/percentile-ranks.md @@ -19,7 +19,7 @@ The `percentile_ranks` aggregation takes the following parameters. | Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `field` | String | Required | The numeric field used to compute percentile ranks on. | +| `field` | String | Required | The numeric field used to compute percentile ranks. | | `values` | Array of doubles | Required | The values used to calculate percentile ranks. | | `keyed` | Boolean | Optional | If set to `false`, returns results as an array. Otherwise returns results as a JSON object. Default is `true`. | | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [Precision tuning with tdigest](#precision-tuning-with-tdigest). 
| @@ -114,7 +114,7 @@ The response demonstrates that 28.6% of the values are less than or equal to `25 ## Keyed response -You can change the format of the returned aggregation from JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: +You can change the format of the returned aggregation from a JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /transaction_data/_search @@ -165,7 +165,7 @@ The response includes an array instead of an object: ## Precision tuning with tdigest -By default, percentile ranks are calculated using the `tdigest` algorithm. You can control the trade-off between accuracy and memory usage by specifying the `tdigest.compression` parameter. Higher values provide better accuracy but require more memory. For more information about how tdigest works, see [precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest) +By default, percentile ranks are calculated using the `tdigest` algorithm. You can control the trade-off between accuracy and memory usage by specifying the `tdigest.compression` parameter. Higher values provide better accuracy but require more memory. For more information about how tdigest works, see [Precision tuning with tdigest]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#precision-tuning-with-tdigest). The following example is configured with `tdigest.compression` set to `200`: @@ -190,7 +190,7 @@ GET /transaction_data/_search ### HDR histogram -As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. 
For more information about how HDR histogram works, see [HDR histogram]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#hdr-histogram) +As an alternative to `tdigest`, you can use the High Dynamic Range (HDR) histogram algorithm, which is better suited for large numbers of buckets and fast processing. For more information about how the HDR histogram works, see [HDR histogram]({{site.url}}{{site.baseurl}}/aggregations/metric/percentile/#hdr-histogram). You should use HDR if you: @@ -226,7 +226,7 @@ GET /transaction_data/_search ### Missing values -If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. The following example ensures that documents without an amount field are treated as if their values are `0` and are included in the percentile ranks computation: +If some documents are missing the target field, you can instruct the query to use a fallback value by setting the `missing` parameter. The following example ensures that documents without an `amount` field are treated as if their values are `0` and are included in the percentile ranks computation: ```json GET /transaction_data/_search diff --git a/_aggregations/metric/percentile.md b/_aggregations/metric/percentile.md index 8249c48b6d4..e9d0d0b3cbc 100644 --- a/_aggregations/metric/percentile.md +++ b/_aggregations/metric/percentile.md @@ -11,7 +11,7 @@ redirect_from: The `percentiles` aggregation estimates the value at a given percentile of a numeric field. This is useful for understanding distribution boundaries. -For example, a 95th percentile of `load_time` = `120ms` means 95% of values are less than or equal to 120ms. +For example, a 95th percentile of `load_time` = `120ms` means that 95% of values are less than or equal to 120 ms. Similarly to the [`cardinality`]({{site.url}}{{site.baseurl}}/aggregations/metric/cardinality/) metric, the `percentile` metric is approximate. 
@@ -21,13 +21,13 @@ The `percentiles` aggregation takes the following parameters. | Parameter | Data type | Required/Optional | Description | | ---------------------------------------- | ---------------- | -------- | --------------------------------------------------------------------------------------------------------------------------- | -| `field` | String | Required | The numeric field used to compute percentiles on. | +| `field` | String | Required | The numeric field used to compute percentiles. | | `percents` | Array of doubles | Optional | The list of percentiles returned in the response. Default is `[1, 5, 25, 50, 75, 95, 99]`. | -| `keyed` | Boolean | Optional | If set to `false` returns results as a array, otherwise returns results as JSON object. Default is `true` | +| `keyed` | Boolean | Optional | If set to `false`, returns results as an array. Otherwise, returns results as a JSON object. Default is `true`. | | `tdigest.compression` | Double | Optional | Controls accuracy and memory usage of the `tdigest` algorithm. See [Precision tuning with tdigest](#precision-tuning-with-tdigest). | -| `hdr.number_of_significant_value_digits` | Integer | Optional | Precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | -| `missing` | Number | Optional | Default value used for documents missing the field. | -| `script` | Object | Optional | Script used to compute custom values instead of using a field. Supports inline and stored scripts. | +| `hdr.number_of_significant_value_digits` | Integer | Optional | The precision setting for the HDR histogram. See [HDR histogram](#hdr-histogram). | +| `missing` | Number | Optional | The default value used when the target field is missing in a document. | +| `script` | Object | Optional | The script used to compute custom values instead of using a field. Supports inline and stored scripts. 
| ## Example @@ -113,7 +113,7 @@ By default, the 1st, 5th, 25th, 50th, 75th, 95th, and 99th percentiles are retur ## Custom percentiles -You can specify the exact percentiles using `percents` array: +You can specify the exact percentiles using the `percents` array: ```json GET /latency_data/_search @@ -131,7 +131,7 @@ GET /latency_data/_search ``` {% include copy-curl.html %} -The response includes only the three requested percentiles aggregations: +The response includes only the three requested percentiles: ```json { @@ -150,7 +150,7 @@ The response includes only the three requested percentiles aggregations: ### Keyed response -You can change the format of the returned aggregation from JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: +You can change the format of the returned aggregation from a JSON object to a list of key-value pairs by setting the `keyed` parameter to `false`: ```json GET /latency_data/_search @@ -216,15 +216,15 @@ The `tdigest` algorithm is the default method used to calculate percentiles. It Unlike exact percentile calculations, `tdigest` uses a probabilistic approach that groups values into _centroids_---small clusters that summarize the distribution. This method enables accurate estimates for most percentiles without needing to store all the raw data in memory. -The algorithm is designed to be highly accurate near the tails of the distribution, the low percentiles (such as 1st) and high percentiles (such as 99th), which are often the most important for performance analysis. You can control the precision of the results using the `compression` parameter. +The algorithm is designed to be highly accurate near the tails of the distribution---the low percentiles (such as the 1st) and high percentiles (such as the 99th)---which are often the most important for performance analysis. You can control the precision of the results using the `compression` parameter. 
-A higher `compression` value means that more centroids are used, increasing accuracy, especially in the tails, but requiring more memory and CPU. A lower `compression` value reduces memory usage and speeds up execution, but the results may be less accurate. +A higher `compression` value means that more centroids are used, which increases accuracy (especially in the tails) but requires more memory and CPU. A lower `compression` value reduces memory usage and speeds up execution, but the results may be less accurate. Use `tdigest` when: * Your data includes floating-point values, such as response times, latency, or duration. -* You need accurate results in the extreme percentiles, for example 1st or 99th. +* You need accurate results in the extreme percentiles, for example, the 1st or 99th. Avoid `tdigest` when: @@ -253,13 +253,13 @@ GET /latency_data/_search ### HDR histogram -HDR (High Dynamic Range) histogram is an alternative to [`tdigest`](#precision-tuning-with-tdigest) for calculating percentiles. It is especially useful when dealing with large datasets and latency measurements. It is designed for speed and supports a wide dynamic range of values while maintaining a fixed, configurable level of precision. +The High Dynamic Range (HDR) histogram is an alternative to [`tdigest`](#precision-tuning-with-tdigest) for calculating percentiles. It is especially useful when dealing with large datasets and latency measurements. It is designed for speed and supports a wide dynamic range of values while maintaining a fixed, configurable level of precision. Unlike [`tdigest`](#precision-tuning-with-tdigest), which offers more accuracy in the tails of a distribution (extreme percentiles), HDR prioritizes speed and uniform accuracy across the range. It works best when the number of buckets is large and extreme precision in rare values is not required. 
For example, if you're measuring response times ranging from 1 microsecond to 1 hour and configure HDR with 3 significant digits, it will record values with a precision of ±1 microsecond for values up to 1 millisecond and ±3.6 seconds for values near 1 hour. -This trade-off makes HDR much faster and more memory-intensive than [`tdigest`](#precision-tuning-with-tdigest) +This trade-off makes HDR much faster but more memory-intensive than [`tdigest`](#precision-tuning-with-tdigest). The following table presents the breakdown of HDR significant digits. @@ -304,7 +304,7 @@ GET /latency_data/_search ### Missing values -Use the `missing` setting to configure a fallback value for documents that do not have the target field: +Use the `missing` setting to configure a fallback value for documents that do not contain the target field: ```json GET /latency_data/_search