Functions
- 1: detect_anomalous_access_cf_fl()
- 2: detect_anomalous_spike_fl()
- 3: graph_blast_radius_fl()
- 4: graph_exposure_perimeter_fl()
- 5: graph_node_centrality_fl()
- 6: graph_path_discovery_fl()
- 7: bartlett_test_fl()
- 8: binomial_test_fl()
- 9: comb_fl()
- 10: dbscan_dynamic_fl()
- 11: dbscan_fl()
- 12: detect_anomalous_new_entity_fl()
- 13: factorial_fl()
- 14: Functions
- 15: Functions library
- 16: geoip_fl()
- 17: get_packages_version_fl()
- 18: kmeans_dynamic_fl()
- 19: kmeans_fl()
- 20: ks_test_fl()
- 21: levene_test_fl()
- 22: log_reduce_fl()
- 23: log_reduce_full_fl()
- 24: log_reduce_predict_fl()
- 25: log_reduce_predict_full_fl()
- 26: log_reduce_train_fl()
- 27: mann_whitney_u_test_fl()
- 28: normality_test_fl()
- 29: pair_probabilities_fl()
- 30: pairwise_dist_fl()
- 31: percentiles_linear_fl()
- 32: perm_fl()
- 33: plotly_anomaly_fl()
- 34: plotly_gauge_fl()
- 35: plotly_scatter3d_fl()
- 36: predict_fl()
- 37: predict_onnx_fl()
- 38: quantize_fl()
- 39: series_clean_anomalies_fl()
- 40: series_cosine_similarity_fl()
- 41: series_dbl_exp_smoothing_fl()
- 42: series_dot_product_fl()
- 43: series_downsample_fl()
- 44: series_exp_smoothing_fl()
- 45: series_fbprophet_forecast_fl()
- 46: series_fit_lowess_fl()
- 47: series_fit_poly_fl()
- 48: series_lag_fl()
- 49: series_metric_fl()
- 50: series_monthly_decompose_anomalies_fl()
- 51: series_moving_avg_fl()
- 52: series_moving_var_fl()
- 53: series_mv_ee_anomalies_fl()
- 54: series_mv_if_anomalies_fl()
- 55: series_mv_oc_anomalies_fl()
- 56: series_rate_fl()
- 57: series_rolling_fl()
- 58: series_shapes_fl()
- 59: series_uv_anomalies_fl()
- 60: series_uv_change_points_fl()
- 61: time_weighted_avg_fl()
- 62: time_weighted_avg2_fl()
- 63: time_weighted_val_fl()
- 64: time_window_rolling_avg_fl()
- 65: two_sample_t_test_fl()
- 66: User-defined functions
- 67: wilcoxon_test_fl()
1 - detect_anomalous_access_cf_fl()
Detect anomalous access using a collaborative filtering (CF) model that identifies anomalous access patterns in timestamped data.
The detect_anomalous_access_cf_fl() function is a user-defined function (UDF) that applies a collaborative filtering (CF) model to detect anomalous entity-resource interactions, such as a User Principal Name (UPN) accessing a Storage account, based on timestamped data like access logs. In a cybersecurity context, this function helps detect abnormal or unauthorized access patterns.
The CF-based model predicts access scores using item similarity, leveraging historical access patterns and the cosine similarity between entities and resources. It estimates the probability of an entity accessing a resource during a defined detection period within a given scope, such as a subscription or an account. Several optional parameters, including a minimal threshold, allow customization of the model’s behavior.
The model outputs an access anomaly score in the range [0, 1], where 0 indicates a high likelihood of legitimate access and 1 indicates a highly anomalous access. Alongside the access anomaly score, the function also returns a binary anomaly flag (based on the defined threshold) and additional explanatory fields.
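For intuition, the following minimal sketch (not part of the function) shows the cosine-similarity building block that the model relies on: series_cosine_similarity() is applied to two made-up binary interaction vectors, and the complement of the similarity-based estimate is taken as the anomaly score. In the actual function, such similarities are aggregated into a weighted average across all resources before the complement is taken.
// Illustrative sketch only; the interaction vectors below are hypothetical.
print interactionsResourceA = dynamic([1, 0, 1, 1, 0]), interactionsResourceB = dynamic([1, 0, 1, 0, 0])
| extend similarity = series_cosine_similarity(interactionsResourceA, interactionsResourceB)
| extend accessAnomalyScore = round(1.0 - similarity, 4)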
Syntax
detect_anomalous_access_cf_fl(entityColumnName, resourceColumnName, scopeColumnName, timeColumnName, startTraining, startDetection, endDetection, [anomalyScoreThresh])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| entityColumnName | string | ✔️ | The name of the input table column containing entity names or IDs for which the CF model is calculated. |
| resourceColumnName | string | ✔️ | The name of the input table column containing resource names or IDs for which the model is calculated. |
| scopeColumnName | string | ✔️ | The name of the input table column containing the partition or scope, so that a different anomaly model is built for each scope. |
| timeColumnName | string | ✔️ | The name of the input table column containing the timestamps that are used to define the training and detection periods. |
| startTraining | datetime | ✔️ | The beginning of the training period for the anomaly model. Its end is defined by the beginning of the detection period. |
| startDetection | datetime | ✔️ | The beginning of the detection period for anomaly detection. |
| endDetection | datetime | ✔️ | The end of the detection period for anomaly detection. |
| anomalyScoreThresh | real | | The minimum value of the anomaly score above which an access is flagged as anomalous, a number in the range [0, 1]. Higher values mean that only more significant cases are considered anomalous, so fewer anomalies are detected (higher precision, lower recall). The default value is 0.9. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let detect_anomalous_access_cf_fl = (T:(*), entityColumnName:string, resourceColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let processedData = (
T
| extend entity = column_ifexists(entityColumnName, '')
| extend resource = column_ifexists(resourceColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(resource) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection , 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// Create all possible pairs (entity, resource) with the same scope
let entities = (
processedData
| where dataSet == 'trainSet'
| summarize by entity, scope
| extend temp = 1
);
let resources = (
processedData
| where dataSet == 'trainSet'
| summarize by resource, scope
| extend temp = 1
);
let potentialAccessTrainData = (
entities
| join kind=inner resources on temp
| distinct entity, resource, scope
);
let accessTrainData = (
potentialAccessTrainData
| join kind=leftouter hint.strategy=broadcast (processedData | where dataSet =='trainSet') on entity, resource, scope
| extend usedOperation = iff(isempty(resource1), 0, 1)
| distinct entity, resource, scope, usedOperation
);
// Aggregate interaction scores per item into a list to prepare for similarity calculations
// Add a temporary key for self-joining later in the process
let ItemUserInteractions = (
accessTrainData
| summarize interactList = make_list(usedOperation) by resource, scope
| extend tempKey=1
);
// Compute item-to-item similarity using cosine similarity
let ItemSimilarities = (
ItemUserInteractions
| join kind=inner (ItemUserInteractions) on tempKey
| where scope == scope1
| extend similarity = series_cosine_similarity(interactList, interactList1)
| extend similarity = iff(isnan(similarity), 0.0, similarity)
| project resource, resource1, scope, similarity
);
// Predict user-item interactions based on item similarities
let Predictions = (
accessTrainData
| join kind=inner (ItemSimilarities) on scope and $left.resource == $right.resource1
| project entity, resource=resource2, usedOperation, similarity
| summarize accessAnomalyScore = sum(usedOperation * similarity) / sum(abs(similarity)) by entity, resource
| extend accessAnomalyScore = iff(isnan(accessAnomalyScore), 0.0, accessAnomalyScore)
| extend accessAnomalyScore = 1 - accessAnomalyScore
| extend accessAnomalyScore = round(accessAnomalyScore, 4)
| join kind=inner accessTrainData on entity, resource
| project entity, resource, scope, usedOperation, accessAnomalyScore
| extend accessAnomalyScore = iff(usedOperation == 0.0, accessAnomalyScore, todouble(usedOperation))
| order by entity asc, resource
);
let resultsData = (
processedData
| where dataSet == "detectSet"
| join kind=leftouter Predictions on entity, resource, scope
| extend isAnomalousAccess = iff(accessAnomalyScore > anomalyScoreThresh, 1, 0)
| project-away sliceTime, entity1, resource1, scope1, usedOperation
);
resultsData
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "KCL", docstring = "Detect anomalous access using collaborative filtering model", skipvalidation = "true")
detect_anomalous_access_cf_fl(T:(*), entityColumnName:string, resourceColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, anomalyScoreThresh:real=0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let processedData = (
T
| extend entity = column_ifexists(entityColumnName, '')
| extend resource = column_ifexists(resourceColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(resource) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// Create all possible pairs (entity, resource) with the same scope
let entities = (
processedData
| where dataSet == 'trainSet'
| summarize by entity, scope
| extend temp = 1
);
let resources = (
processedData
| where dataSet == 'trainSet'
| summarize by resource, scope
| extend temp = 1
);
let potentialAccessTrainData = (
entities
| join kind=inner resources on temp
| distinct entity, resource, scope
);
let accessTrainData = (
potentialAccessTrainData
| join kind=leftouter hint.strategy=broadcast (processedData | where dataSet =='trainSet') on entity, resource, scope
| extend usedOperation = iff(isempty(resource1), 0, 1)
| distinct entity, resource, scope, usedOperation
);
// Aggregate interaction scores per item into a list to prepare for similarity calculations
// Add a temporary key for self-joining later in the process
let ItemUserInteractions = (
accessTrainData
| summarize interactList = make_list(usedOperation) by resource, scope
| extend tempKey=1
);
// Compute item-to-item similarity using cosine similarity
let ItemSimilarities = (
ItemUserInteractions
| join kind=inner (ItemUserInteractions) on tempKey
| where scope == scope1
| extend similarity = series_cosine_similarity(interactList, interactList1)
| extend similarity = iff(isnan(similarity), 0.0, similarity)
| project resource, resource1, scope, similarity
);
// Predict user-item interactions based on item similarities
let Predictions = (
accessTrainData
| join kind=inner (ItemSimilarities) on scope and $left.resource == $right.resource1
| project entity, resource=resource2, usedOperation, similarity
| summarize accessAnomalyScore = sum(usedOperation * similarity) / sum(abs(similarity)) by entity, resource
| extend accessAnomalyScore = iff(isnan(accessAnomalyScore), 0.0, accessAnomalyScore)
| extend accessAnomalyScore = 1 - accessAnomalyScore
| extend accessAnomalyScore = round(accessAnomalyScore, 4)
| join kind=inner accessTrainData on entity, resource
| project entity, resource, scope, usedOperation, accessAnomalyScore
| extend accessAnomalyScore = iff(usedOperation == 0.0, accessAnomalyScore, todouble(usedOperation))
| order by entity asc, resource
);
let resultsData = (
processedData
| where dataSet == "detectSet"
| join kind=leftouter Predictions on entity, resource, scope
| extend isAnomalousAccess = iff(accessAnomalyScore > anomalyScoreThresh, 1, 0)
| project-away sliceTime, entity1, resource1, scope1, usedOperation
);
resultsData
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let detect_anomalous_access_cf_fl = (T:(*), entityColumnName:string, resourceColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let processedData = (
T
| extend entity = column_ifexists(entityColumnName, '')
| extend resource = column_ifexists(resourceColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(resource) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// Create all possible pairs (entity, resource) with the same scope
let entities = (
processedData
| where dataSet == 'trainSet'
| summarize by entity, scope
| extend temp = 1
);
let resources = (
processedData
| where dataSet == 'trainSet'
| summarize by resource, scope
| extend temp = 1
);
let potentialAccessTrainData = (
entities
| join kind=inner resources on temp
| distinct entity, resource, scope
);
let accessTrainData = (
potentialAccessTrainData
| join kind=leftouter hint.strategy=broadcast (processedData | where dataSet =='trainSet') on entity, resource, scope
| extend usedOperation = iff(isempty(resource1), 0, 1)
| distinct entity, resource, scope, usedOperation
);
// Aggregate interaction scores per item into a list to prepare for similarity calculations
// Add a temporary key for self-joining later in the process
let ItemUserInteractions = (
accessTrainData
| summarize interactList = make_list(usedOperation) by resource, scope
| extend tempKey=1
);
// Compute item-to-item similarity using cosine similarity
let ItemSimilarities = (
ItemUserInteractions
| join kind=inner (ItemUserInteractions) on tempKey
| where scope == scope1
| extend similarity = series_cosine_similarity(interactList, interactList1)
| extend similarity = iff(isnan(similarity), 0.0, similarity)
| project resource, resource1, scope, similarity
);
// Predict user-item interactions based on item similarities
let Predictions = (
accessTrainData
| join kind=inner (ItemSimilarities) on scope and $left.resource == $right.resource1
| project entity, resource=resource2, usedOperation, similarity
| summarize accessAnomalyScore = sum(usedOperation * similarity) / sum(abs(similarity)) by entity, resource
| extend accessAnomalyScore = iff(isnan(accessAnomalyScore), 0.0, accessAnomalyScore)
| extend accessAnomalyScore = 1 - accessAnomalyScore
| extend accessAnomalyScore = round(accessAnomalyScore, 4)
| join kind=inner accessTrainData on entity, resource
| project entity, resource, scope, usedOperation, accessAnomalyScore
| extend accessAnomalyScore = iff(usedOperation == 0.0, accessAnomalyScore, todouble(usedOperation))
| order by entity asc, resource
);
let resultsData = (
processedData
| where dataSet == "detectSet"
| join kind=leftouter Predictions on entity, resource, scope
| extend isAnomalousAccess = iff(accessAnomalyScore > anomalyScoreThresh, 1, 0)
| project-away sliceTime, entity1, resource1, scope1, usedOperation
);
resultsData
};
// synthetic data generation
let detectPeriodStart = datetime(2022-04-30 05:00);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let devices = toscalar(range device_id from 1 to 51 step 1 | extend device = strcat("device", tostring(device_id)) | summarize devices_array = make_list(device));
let countDevices = array_length(devices)-1;
let testData = range t from 0 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = tostring(devices[toint(rand(countDevices))])
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == trainPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == trainPeriodStart, 'device1', deviceId)
| extend accountName = iff(timeSlice == trainPeriodStart, 'prodEnvironment', accountName)
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'device50', deviceId)
| extend accountName = iff(timeSlice == detectPeriodStart, 'prodEnvironment', accountName)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_access_cf_fl(entityColumnName = 'userName'
, resourceColumnName = 'deviceId'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Stored
// synthetic data generation
let detectPeriodStart = datetime(2022-04-30 05:00);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let devices = toscalar(range device_id from 1 to 51 step 1 | extend device = strcat("device", tostring(device_id)) | summarize devices_array = make_list(device));
let countDevices = array_length(devices)-1;
let testData = range t from 0 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = tostring(devices[toint(rand(countDevices))])
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == trainPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == trainPeriodStart, 'device1', deviceId)
| extend accountName = iff(timeSlice == trainPeriodStart, 'prodEnvironment', accountName)
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'device50', deviceId)
| extend accountName = iff(timeSlice == detectPeriodStart, 'prodEnvironment', accountName)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_access_cf_fl(entityColumnName = 'userName'
, resourceColumnName = 'deviceId'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Output
| t | timeSlice | userName | deviceId | accountName | entity | resource | scope | dataSet | accessAnomalyScore | isAnomalousAccess |
|---|---|---|---|---|---|---|---|---|---|---|
| 1440 | 2022-04-30 05:00:00.0000000 | H4ck3r | device50 | prodEnvironment | H4ck3r | device50 | prodEnvironment | detectSet | 0.982 | 1 |
The output of running the function shows each anomalous entity-resource access event during the detection period, filtered for cases where the access anomaly score (based on collaborative filtering) was higher than the defined anomaly threshold (by default, 0.9). Additional fields are added for clarity:
- dataSet: current dataset (is always detectSet).
- accessAnomalyScore: the predicted access anomaly score of this access, based on collaborative filtering modeling. The value is in the range [0, 1]; higher values signify a higher degree of anomaly.
- isAnomalousAccess: binary flag for anomalous accesses.
Running the function with default parameters flags the access attempt by the user ‘H4ck3r’ to device ‘device50’ within the ‘prodEnvironment’ account. The predicted access anomaly score is 0.982, which is very high, indicating that this access is unexpected according to the trained model based on historical patterns.
In the training period, the collaborative filtering model learned access patterns between users and devices within each scope. Since 'H4ck3r' accessing 'device50' wasn't observed in the historical data and is considered unlikely by the model, it was flagged as anomalous.
The output table presents these anomalous accesses together with the predicted access score. These fields are useful for further investigation, alerting, or integration with broader detection workflows.
The suggested usage in a cybersecurity context is to monitor important entities, such as usernames or IPs, accessing important resources like devices, databases, or applications within their corresponding scopes (e.g., account or subscription).
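For example, a hedged usage sketch along these lines could run the stored function over storage access logs; the table name StorageAccessLogs and all column names below are hypothetical placeholders to be replaced with your own schema.
// Hypothetical usage sketch; adjust table and column names to your data.
StorageAccessLogs
| invoke detect_anomalous_access_cf_fl(entityColumnName = 'UserPrincipalName'
    , resourceColumnName = 'StorageAccountName'
    , scopeColumnName = 'SubscriptionId'
    , timeColumnName = 'TimeGenerated'
    , startTraining = ago(60d)     // historical accesses used to train the CF model
    , startDetection = ago(1d)     // detect anomalies over the last day
    , endDetection = now()
)
| where isAnomalousAccess == 1     // keep only accesses above the default 0.9 threshold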
2 - detect_anomalous_spike_fl()
Detect the appearance of anomalous spikes in numeric variables in timestamped data.
The function detect_anomalous_spike_fl() is a UDF (user-defined function) that detects the appearance of anomalous spikes in numeric variables - such as the amount of exfiltrated data or the number of failed sign-in attempts - in timestamped data, such as traffic logs. In a cybersecurity context, such events might be suspicious and indicate a potential attack or compromise.
The anomaly model is based on a combination of two scores: Z-score (the number of standard deviations above average) and Q-score (the number of interquantile ranges above a high quantile). Z-score is a straightforward and common outlier metric; Q-score is based on Tukey’s fences, extended here to arbitrary quantiles for more control. Choosing different quantiles (by default, the 90th and 25th percentiles are used) allows detecting more significant outliers, thus improving precision. The model is built on top of a numeric variable (such as the number of sign-in attempts or the amount of exfiltrated data) and is calculated per scope (such as subscription or account) and per entity (such as user or device).
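As a minimal illustration (not part of the function), the following sketch computes the two scores for a single value using the same formulas that appear in the function code below; the baseline statistics are made up.
// Illustrative only: Z-score and Q-score for a hypothetical value x and baseline statistics.
print x = 5000.0, avgNum = 1360.0, sdNum = 270.0, lowPrcNum = 605.0, highPrcNum = 628.0
| extend zScore = round((x - avgNum) / (sdNum + 1), 2)                      // standard deviations above average
| extend qScore = round((x - highPrcNum) / (highPrcNum - lowPrcNum + 1), 2) // interquantile ranges above the high quantile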
After calculating the scores for a single-variate numeric datapoint and checking other requirements (for example, that the number of active days on the scope in the training period is above a predefined threshold), we check whether each of the scores is above its predefined threshold. If so, a spike is detected and the datapoint is flagged as anomalous. Two models are built: one at the entity level (defined by the entityColumnName parameter, such as user or device) per scope (defined by the scopeColumnName parameter, such as account or subscription), and a second model for the whole scope. The anomaly detection logic is executed for each model, and if an anomaly is detected in either of them, it is reported. By default, upward spikes are detected; downward spikes (‘dips’) can also be interesting in some contexts and can be detected by adapting the logic.
The model’s direct output is an anomaly score based on these scores. The score is monotonic in the range [0, 1], with values closer to 1 representing stronger anomalies. In addition to the anomaly score, there’s a binary flag for the detected anomaly (controlled by minimum threshold parameters), and other explanatory fields.
Note that the function disregards the temporal structure of the variable (mainly for scalability and explainability). If the variable has significant temporal components - such as trend and seasonality - we suggest either using the series_decompose_anomalies() function, or using series_decompose() to calculate the residual and executing detect_anomalous_spike_fl() on top of it.
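A possible (untested) sketch of that residual-based approach is shown below; the table MyTrafficLogs and its columns are hypothetical placeholders, and series_decompose() is used only to obtain the residual series before invoking the function.
// Hypothetical sketch: remove trend and seasonality first, then detect spikes on the residual.
MyTrafficLogs
| make-series eventCount = count() on TimeGenerated step 1d by accountName, userName
| extend (baseline, seasonal, trend, residual) = series_decompose(eventCount)
| mv-expand TimeGenerated to typeof(datetime), residual to typeof(double)
| invoke detect_anomalous_spike_fl(numericColumnName = 'residual'
    , entityColumnName = 'userName'
    , scopeColumnName = 'accountName'
    , timeColumnName = 'TimeGenerated'
    , startTraining = ago(60d)
    , startDetection = ago(1d)
    , endDetection = now()
)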
Syntax
detect_anomalous_spike_fl(numericColumnName, entityColumnName, scopeColumnName, timeColumnName, startTraining, startDetection, endDetection, [minTrainingDaysThresh], [lowPercentileForQscore], [highPercentileForQscore], [minSlicesPerEntity], [zScoreThreshEntity], [qScoreThreshEntity], [minNumValueThreshEntity], [minSlicesPerScope], [zScoreThreshScope], [qScoreThreshScope], [minNumValueThreshScope])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| numericColumnName | string | ✔️ | The name of the input table column containing the numeric variable for which anomaly models are calculated. |
| entityColumnName | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which the anomaly model is calculated. |
| scopeColumnName | string | ✔️ | The name of the input table column containing the partition or scope, so that a different anomaly model is built for each scope. |
| timeColumnName | string | ✔️ | The name of the input table column containing the timestamps that are used to define the training and detection periods. |
| startTraining | datetime | ✔️ | The beginning of the training period for the anomaly model. Its end is defined by the beginning of the detection period. |
| startDetection | datetime | ✔️ | The beginning of the detection period for anomaly detection. |
| endDetection | datetime | ✔️ | The end of the detection period for anomaly detection. |
| minTrainingDaysThresh | int | | The minimum number of days in the training period for which a scope must exist in order to calculate anomalies. If it's below the threshold, the scope is considered too new and unknown, so anomalies aren’t calculated. The default value is 14. |
| lowPercentileForQscore | real | | A number in the range [0.0, 1.0] representing the percentile to be calculated as the low limit for Q-score. In Tukey’s fences, 0.25 is used. The default value is 0.25. Choosing a lower percentile improves precision as more significant anomalies are detected. |
| highPercentileForQscore | real | | A number in the range [0.0, 1.0] representing the percentile to be calculated as the high limit for Q-score. In Tukey’s fences, 0.75 is used. The default value is 0.9. Choosing a higher percentile improves precision as more significant anomalies are detected. |
| minSlicesPerEntity | int | | The minimum threshold of ‘slices’ (for example, days) that must exist for an entity before an anomaly model is built for it. If the number is below the threshold, the entity is considered too new and unstable. The default value is 20. |
| zScoreThreshEntity | real | | The minimum threshold for the entity-level Z-score (number of standard deviations above average) to be flagged as an anomaly. When choosing higher values, only more significant anomalies are detected. The default value is 3.0. |
| qScoreThreshEntity | real | | The minimum threshold for the entity-level Q-score (number of interquantile ranges above the high quantile) to be flagged as an anomaly. When choosing higher values, only more significant anomalies are detected. The default value is 2.0. |
| minNumValueThreshEntity | long | | The minimum threshold of the numeric variable to be flagged as an anomaly for an entity. This is useful for filtering cases when a value is statistically anomalous (high Z-score and Q-score), but the value itself is too small to be interesting. The default value is 0. |
| minSlicesPerScope | int | | The minimum threshold of ‘slices’ (for example, days) that must exist for a scope before an anomaly model is built for it. If the number is below the threshold, the scope is considered too new and unstable. The default value is 20. |
| zScoreThreshScope | real | | The minimum threshold for the scope-level Z-score (number of standard deviations above average) to be flagged as an anomaly. When choosing higher values, only more significant anomalies are detected. The default value is 3.0. |
| qScoreThreshScope | real | | The minimum threshold for the scope-level Q-score (number of interquantile ranges above the high quantile) to be flagged as an anomaly. When choosing higher values, only more significant anomalies are detected. The default value is 2.0. |
| minNumValueThreshScope | long | | The minimum threshold of the numeric variable to be flagged as an anomaly for a scope. This is useful for filtering cases when a value is statistically anomalous (high Z-score and Q-score), but the value itself is too small to be interesting. The default value is 0. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let detect_anomalous_spike_fl = (T:(*), numericColumnName:string, entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime, minTrainingDaysThresh:int = 14
, lowPercentileForQscore:real = 0.25, highPercentileForQscore:real = 0.9
, minSlicesPerEntity:int = 20, zScoreThreshEntity:real = 3.0, qScoreThreshEntity:real = 2.0, minNumValueThreshEntity:long = 0
, minSlicesPerScope:int = 20, zScoreThreshScope:real = 3.0, qScoreThreshScope:real = 2.0, minNumValueThreshScope:long = 0)
{
// pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend numVec = tolong(column_ifexists(numericColumnName, 0))
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
let aggregatedCandidateScopeData = (
processedData
| summarize firstSeenScope = min(sliceTime), lastSeenScope = max(sliceTime) by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection
);
let entityModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesEntity = dcount(sliceTime), avgNumEntity = avg(numVec), sdNumEntity = stdev(numVec)
, lowPrcNumEntity = percentile(numVec, lowPercentileForQscore), highPrcNumEntity = percentile(numVec, highPercentileForQscore)
, firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime)
by scope, entity
| extend slicesInTrainingEntity = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
);
let scopeModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesScope = dcount(sliceTime), avgNumScope = avg(numVec), sdNumScope = stdev(numVec)
, lowPrcNumScope = percentile(numVec, lowPercentileForQscore), highPrcNumScope = percentile(numVec, highPercentileForQscore)
by scope
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (aggregatedCandidateScopeData) on scope
| join kind = leftouter (entityModelData) on scope, entity
| join kind = leftouter (scopeModelData) on scope
| extend zScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - avgNumEntity)/(sdNumEntity + 1), 2), 0.0)
, qScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - highPrcNumEntity)/(highPrcNumEntity - lowPrcNumEntity + 1), 2), 0.0)
, zScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - avgNumScope)/(sdNumScope + 1), 2), 0.0)
, qScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - highPrcNumScope)/(highPrcNumScope - lowPrcNumScope + 1), 2), 0.0)
| extend isSpikeOnEntity = iff((slicesInTrainingEntity >= minTrainingDaysThresh and zScoreEntity > zScoreThreshEntity and qScoreEntity > qScoreThreshEntity and numVec >= minNumValueThreshEntity), 1, 0)
, entityHighBaseline= round(max_of((avgNumEntity + sdNumEntity), highPrcNumEntity), 2)
, isSpikeOnScope = iff((countSlicesScope >= minTrainingDaysThresh and zScoreScope > zScoreThreshScope and qScoreScope > qScoreThreshScope and numVec >= minNumValueThreshScope), 1, 0)
, scopeHighBaseline = round(max_of((avgNumEntity + 2 * sdNumEntity), highPrcNumScope), 2)
| extend entitySpikeAnomalyScore = iff(isSpikeOnEntity == 1, round(1.0 - 0.25/(max_of(zScoreEntity, qScoreEntity)),4), 0.00)
, scopeSpikeAnomalyScore = iff(isSpikeOnScope == 1, round(1.0 - 0.25/(max_of(zScoreScope, qScoreScope)), 4), 0.00)
| where isSpikeOnEntity == 1 or isSpikeOnScope == 1
| extend avgNumEntity = round(avgNumEntity, 2), sdNumEntity = round(sdNumEntity, 2)
, avgNumScope = round(avgNumScope, 2), sdNumScope = round(sdNumScope, 2)
| project-away entity1, scope1, scope2, scope3
| extend anomalyType = iff(isSpikeOnEntity == 1, strcat('spike_', entityColumnName), strcat('spike_', scopeColumnName)), anomalyScore = max_of(entitySpikeAnomalyScore, scopeSpikeAnomalyScore)
| extend anomalyExplainability = iff(isSpikeOnEntity == 1
, strcat('The value of numeric variable ', numericColumnName, ' for ', entityColumnName, ' ', entity, ' is ', numVec, ', which is abnormally high for this '
, entityColumnName, ' at this ', scopeColumnName
, '. Based on observations from last ' , slicesInTrainingEntity, ' ', timePeriodBinSize, 's, the expected baseline value is below ', entityHighBaseline, '.')
, strcat('The value of numeric variable ', numericColumnName, ' on ', scopeColumnName, ' ', scope, ' is ', numVec, ', which is abnormally high for this '
, scopeColumnName, '. Based on observations from last ' , slicesInTrainingScope, ' ', timePeriodBinSize, 's, the expected baseline value is below ', scopeHighBaseline, '.'))
| extend anomalyState = iff(isSpikeOnEntity == 1
, bag_pack('avg', avgNumEntity, 'stdev', sdNumEntity, strcat('percentile_', lowPercentileForQscore), lowPrcNumEntity, strcat('percentile_', highPercentileForQscore), highPrcNumEntity)
, bag_pack('avg', avgNumScope, 'stdev', sdNumScope, strcat('percentile_', lowPercentileForQscore), lowPrcNumScope, strcat('percentile_', highPercentileForQscore), highPrcNumScope))
| project-away lowPrcNumEntity, highPrcNumEntity, lowPrcNumScope, highPrcNumScope
);
resultsData
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Detect anomalous high spikes in a numeric variable (such as amount of extracted data or failed logins) per scope (such as subscription or account) or per entity (such as user or device) on scope", skipvalidation = "true", folder = 'Cybersecurity')
detect_anomalous_spike_fl(T:(*), numericColumnName:string, entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime, minTrainingDaysThresh:int = 14
, lowPercentileForQscore:real = 0.25, highPercentileForQscore:real = 0.9
, minSlicesPerEntity:int = 20, zScoreThreshEntity:real = 3.0, qScoreThreshEntity:real = 2.0, minNumValueThreshEntity:long = 0
, minSlicesPerScope:int = 20, zScoreThreshScope:real = 3.0, qScoreThreshScope:real = 2.0, minNumValueThreshScope:long = 0)
{
// pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend numVec = tolong(column_ifexists(numericColumnName, 0))
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
let aggregatedCandidateScopeData = (
processedData
| summarize firstSeenScope = min(sliceTime), lastSeenScope = max(sliceTime) by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection
);
let entityModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesEntity = dcount(sliceTime), avgNumEntity = avg(numVec), sdNumEntity = stdev(numVec)
, lowPrcNumEntity = percentile(numVec, lowPercentileForQscore), highPrcNumEntity = percentile(numVec, highPercentileForQscore)
, firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime)
by scope, entity
| extend slicesInTrainingEntity = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
);
let scopeModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesScope = dcount(sliceTime), avgNumScope = avg(numVec), sdNumScope = stdev(numVec)
, lowPrcNumScope = percentile(numVec, lowPercentileForQscore), highPrcNumScope = percentile(numVec, highPercentileForQscore)
by scope
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (aggregatedCandidateScopeData) on scope
| join kind = leftouter (entityModelData) on scope, entity
| join kind = leftouter (scopeModelData) on scope
| extend zScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - avgNumEntity)/(sdNumEntity + 1), 2), 0.0)
, qScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - highPrcNumEntity)/(highPrcNumEntity - lowPrcNumEntity + 1), 2), 0.0)
, zScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - avgNumScope)/(sdNumScope + 1), 2), 0.0)
, qScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - highPrcNumScope)/(highPrcNumScope - lowPrcNumScope + 1), 2), 0.0)
| extend isSpikeOnEntity = iff((slicesInTrainingEntity >= minTrainingDaysThresh and zScoreEntity > zScoreThreshEntity and qScoreEntity > qScoreThreshEntity and numVec >= minNumValueThreshEntity), 1, 0)
, entityHighBaseline= round(max_of((avgNumEntity + sdNumEntity), highPrcNumEntity), 2)
, isSpikeOnScope = iff((countSlicesScope >= minTrainingDaysThresh and zScoreScope > zScoreThreshScope and qScoreScope > qScoreThreshScope and numVec >= minNumValueThreshScope), 1, 0)
, scopeHighBaseline = round(max_of((avgNumEntity + 2 * sdNumEntity), highPrcNumScope), 2)
| extend entitySpikeAnomalyScore = iff(isSpikeOnEntity == 1, round(1.0 - 0.25/(max_of(zScoreEntity, qScoreEntity)),4), 0.00)
, scopeSpikeAnomalyScore = iff(isSpikeOnScope == 1, round(1.0 - 0.25/(max_of(zScoreScope, qScoreScope)), 4), 0.00)
| where isSpikeOnEntity == 1 or isSpikeOnScope == 1
| extend avgNumEntity = round(avgNumEntity, 2), sdNumEntity = round(sdNumEntity, 2)
, avgNumScope = round(avgNumScope, 2), sdNumScope = round(sdNumScope, 2)
| project-away entity1, scope1, scope2, scope3
| extend anomalyType = iff(isSpikeOnEntity == 1, strcat('spike_', entityColumnName), strcat('spike_', scopeColumnName)), anomalyScore = max_of(entitySpikeAnomalyScore, scopeSpikeAnomalyScore)
| extend anomalyExplainability = iff(isSpikeOnEntity == 1
, strcat('The value of numeric variable ', numericColumnName, ' for ', entityColumnName, ' ', entity, ' is ', numVec, ', which is abnormally high for this '
, entityColumnName, ' at this ', scopeColumnName
, '. Based on observations from last ' , slicesInTrainingEntity, ' ', timePeriodBinSize, 's, the expected baseline value is below ', entityHighBaseline, '.')
, strcat('The value of numeric variable ', numericColumnName, ' on ', scopeColumnName, ' ', scope, ' is ', numVec, ', which is abnormally high for this '
, scopeColumnName, '. Based on observations from last ' , slicesInTrainingScope, ' ', timePeriodBinSize, 's, the expected baseline value is below ', scopeHighBaseline, '.'))
| extend anomalyState = iff(isSpikeOnEntity == 1
, bag_pack('avg', avgNumEntity, 'stdev', sdNumEntity, strcat('percentile_', lowPercentileForQscore), lowPrcNumEntity, strcat('percentile_', highPercentileForQscore), highPrcNumEntity)
, bag_pack('avg', avgNumScope, 'stdev', sdNumScope, strcat('percentile_', lowPercentileForQscore), lowPrcNumScope, strcat('percentile_', highPercentileForQscore), highPrcNumScope))
| project-away lowPrcNumEntity, highPrcNumEntity, lowPrcNumScope, highPrcNumScope
);
resultsData
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let detect_anomalous_spike_fl = (T:(*), numericColumnName:string, entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime, minTrainingDaysThresh:int = 14
, lowPercentileForQscore:real = 0.25, highPercentileForQscore:real = 0.9
, minSlicesPerEntity:int = 20, zScoreThreshEntity:real = 3.0, qScoreThreshEntity:real = 2.0, minNumValueThreshEntity:long = 0
, minSlicesPerScope:int = 20, zScoreThreshScope:real = 3.0, qScoreThreshScope:real = 2.0, minNumValueThreshScope:long = 0)
{
// pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend numVec = tolong(column_ifexists(numericColumnName, 0))
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
let aggregatedCandidateScopeData = (
processedData
| summarize firstSeenScope = min(sliceTime), lastSeenScope = max(sliceTime) by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection
);
let entityModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesEntity = dcount(sliceTime), avgNumEntity = avg(numVec), sdNumEntity = stdev(numVec)
, lowPrcNumEntity = percentile(numVec, lowPercentileForQscore), highPrcNumEntity = percentile(numVec, highPercentileForQscore)
, firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime)
by scope, entity
| extend slicesInTrainingEntity = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
);
let scopeModelData = (
processedData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where dataSet == 'trainSet'
| summarize countSlicesScope = dcount(sliceTime), avgNumScope = avg(numVec), sdNumScope = stdev(numVec)
, lowPrcNumScope = percentile(numVec, lowPercentileForQscore), highPrcNumScope = percentile(numVec, highPercentileForQscore)
by scope
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (aggregatedCandidateScopeData) on scope
| join kind = leftouter (entityModelData) on scope, entity
| join kind = leftouter (scopeModelData) on scope
| extend zScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - avgNumEntity)/(sdNumEntity + 1), 2), 0.0)
, qScoreEntity = iff(countSlicesEntity >= minSlicesPerEntity, round((toreal(numVec) - highPrcNumEntity)/(highPrcNumEntity - lowPrcNumEntity + 1), 2), 0.0)
, zScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - avgNumScope)/(sdNumScope + 1), 2), 0.0)
, qScoreScope = iff(countSlicesScope >= minSlicesPerScope, round((toreal(numVec) - highPrcNumScope)/(highPrcNumScope - lowPrcNumScope + 1), 2), 0.0)
| extend isSpikeOnEntity = iff((slicesInTrainingEntity >= minTrainingDaysThresh and zScoreEntity > zScoreThreshEntity and qScoreEntity > qScoreThreshEntity and numVec >= minNumValueThreshEntity), 1, 0)
, entityHighBaseline= round(max_of((avgNumEntity + sdNumEntity), highPrcNumEntity), 2)
, isSpikeOnScope = iff((countSlicesScope >= minTrainingDaysThresh and zScoreScope > zScoreThreshScope and qScoreScope > qScoreThreshScope and numVec >= minNumValueThreshScope), 1, 0)
, scopeHighBaseline = round(max_of((avgNumEntity + 2 * sdNumEntity), highPrcNumScope), 2)
| extend entitySpikeAnomalyScore = iff(isSpikeOnEntity == 1, round(1.0 - 0.25/(max_of(zScoreEntity, qScoreEntity)),4), 0.00)
, scopeSpikeAnomalyScore = iff(isSpikeOnScope == 1, round(1.0 - 0.25/(max_of(zScoreScope, qScoreScope)), 4), 0.00)
| where isSpikeOnEntity == 1 or isSpikeOnScope == 1
| extend avgNumEntity = round(avgNumEntity, 2), sdNumEntity = round(sdNumEntity, 2)
, avgNumScope = round(avgNumScope, 2), sdNumScope = round(sdNumScope, 2)
| project-away entity1, scope1, scope2, scope3
| extend anomalyType = iff(isSpikeOnEntity == 1, strcat('spike_', entityColumnName), strcat('spike_', scopeColumnName)), anomalyScore = max_of(entitySpikeAnomalyScore, scopeSpikeAnomalyScore)
| extend anomalyExplainability = iff(isSpikeOnEntity == 1
, strcat('The value of numeric variable ', numericColumnName, ' for ', entityColumnName, ' ', entity, ' is ', numVec, ', which is abnormally high for this '
, entityColumnName, ' at this ', scopeColumnName
, '. Based on observations from last ' , slicesInTrainingEntity, ' ', timePeriodBinSize, 's, the expected baseline value is below ', entityHighBaseline, '.')
, strcat('The value of numeric variable ', numericColumnName, ' on ', scopeColumnName, ' ', scope, ' is ', numVec, ', which is abnormally high for this '
, scopeColumnName, '. Based on observations from last ' , slicesInTrainingScope, ' ', timePeriodBinSize, 's, the expected baseline value is below ', scopeHighBaseline, '.'))
| extend anomalyState = iff(isSpikeOnEntity == 1
, bag_pack('avg', avgNumEntity, 'stdev', sdNumEntity, strcat('percentile_', lowPercentileForQscore), lowPrcNumEntity, strcat('percentile_', highPercentileForQscore), highPrcNumEntity)
, bag_pack('avg', avgNumScope, 'stdev', sdNumScope, strcat('percentile_', lowPercentileForQscore), lowPrcNumScope, strcat('percentile_', highPercentileForQscore), highPrcNumScope))
| project-away lowPrcNumEntity, highPrcNumEntity, lowPrcNumScope, highPrcNumScope
);
resultsData
};
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend countEvents = iff(timeSlice == detectPeriodStart, 3*countEvents, countEvents)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_spike_fl(numericColumnName = 'countEvents'
, entityColumnName = 'userName'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Stored
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend countEvents = iff(timeSlice == detectPeriodStart, 3*countEvents, countEvents)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_spike_fl(numericColumnName = 'countEvents'
, entityColumnName = 'userName'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Output
| t | timeSlice | countEvents | userName | deviceId | accountName | scope | entity | numVec | sliceTime | dataSet | firstSeenScope | lastSeenScope | slicesInTrainingScope | countSlicesEntity | avgNumEntity | sdNumEntity | firstSeenEntity | lastSeenEntity | slicesInTrainingEntity | countSlicesScope | avgNumScope | sdNumScope | zScoreEntity | qScoreEntity | zScoreScope | qScoreScope | isSpikeOnEntity | entityHighBaseline | isSpikeOnScope | scopeHighBaseline | entitySpikeAnomalyScore | scopeSpikeAnomalyScore | anomalyType | anomalyScore | anomalyExplainability | anomalyState |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1440 | 2022-04-30 05:00:00.0000000 | 5079 | H4ck3r | 9e8e151aced5a64938b93ee0c13fe940 | prodEnvironment | prodEnvironment | H4ck3r | 5079 | 2022-04-30 05:00:00.0000000 | detectSet | 2022-03-01 08:00:00.0000000 | 2022-04-30 05:00:00.0000000 | 60 | | | | | | | 1155 | 1363.22 | 267.51 | 0 | 0 | 13.84 | 185.46 | 0 | | 1 | 628 | 0 | 0.9987 | spike_accountName | 0.9987 | The value of numeric variable countEvents on accountName prodEnvironment is 5079, which is abnormally high for this accountName. Based on observations from last 60 days, the expected baseline value is below 628.0. | {"avg": 1363.22, "stdev": 267.51, "percentile_0.25": 605, "percentile_0.9": 628} |
The output of running the function is the rows in detection dataset that were tagged as anomalous spikes either at scope or entity levels. Some other fields are added for clarity:
- dataSet: current dataset (is always detectSet).
- firstSeenScope: timestamp when the scope was first seen.
- lastSeenScope: timestamp when the scope was last seen.
- slicesInTrainingScope: number of slices (for example, days) that the scope exists in the training dataset.
- countSlicesEntity: number of slices (for example, days) that the entity exists on the scope.
- avgNumEntity: average of the numeric variable in the training set per entity on the scope.
- sdNumEntity: standard deviation of the numeric variable in the training set per entity on the scope.
- firstSeenEntity: timestamp when the entity was first seen on the scope.
- lastSeenEntity: timestamp when the entity was last seen on the scope.
- slicesInTrainingEntity: number of slices (for example, days) that the entity exists on the scope in the training dataset.
- countSlicesScope: number of slices (for example, days) that the scope exists.
- avgNumScope: average of the numeric variable in the training set per scope.
- sdNumScope: standard deviation of the numeric variable in the training set per scope.
- zScoreEntity: Z-score for the current value of the numeric variable based on the entity model.
- qScoreEntity: Q-score for the current value of the numeric variable based on the entity model.
- zScoreScope: Z-score for the current value of the numeric variable based on the scope model.
- qScoreScope: Q-score for the current value of the numeric variable based on the scope model.
- isSpikeOnEntity: binary flag for an anomalous spike based on the entity model.
- entityHighBaseline: expected high baseline for numeric variable values based on the entity model.
- isSpikeOnScope: binary flag for an anomalous spike based on the scope model.
- scopeHighBaseline: expected high baseline for numeric variable values based on the scope model.
- entitySpikeAnomalyScore: anomaly score for the spike based on the entity model; a number in the range [0,1], higher values meaning more anomalous.
- scopeSpikeAnomalyScore: anomaly score for the spike based on the scope model; a number in the range [0,1], higher values meaning more anomalous.
- anomalyType: shows the type of anomaly (helpful when running several anomaly detection logics together).
- anomalyScore: anomaly score for the spike based on the chosen model.
- anomalyExplainability: textual wrapper for the generated anomaly and its explanation.
- anomalyState: bag of metrics from the chosen model (average, standard deviation, and percentiles) describing the model.
In the example above, running this function on the countEvents variable, using user as the entity and account as the scope with default parameters, detects a spike at the scope level. Since the user ‘H4ck3r’ doesn’t have enough data in the training period, the anomaly isn’t calculated at the entity level and all relevant fields are empty. The scope-level anomaly has an anomaly score of 0.998, meaning that this spike is anomalous for the scope.
If we raise any of the minimum thresholds high enough, no anomaly is detected, since the requirements would be too strict.
The output shows the rows with anomalous spikes together with explanation fields in a standardized format. These fields are useful for investigating the anomaly, for running anomalous spike detection on several numeric variables, or for running other algorithms together.
The suggested usage in a cybersecurity context is to run the function on meaningful numeric variables (amounts of downloaded data, counts of uploaded files, or failed sign-in attempts) per meaningful scopes (such as subscriptions or accounts) and entities (such as users or devices). A detected anomalous spike means that the numeric value is higher than what is expected on that scope or entity, and might be suspicious.
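As a hedged sketch of such usage on sign-in data (the table SigninLogs, the ResultType convention, and all column names are placeholders that depend on your schema):
// Hypothetical sketch: daily failed sign-in counts per user and subscription, then spike detection.
SigninLogs
| where ResultType != '0'                   // keep failed sign-ins only (convention is schema-dependent)
| summarize failedLogins = count() by userName = UserPrincipalName, subscriptionId = SubscriptionId, sliceTime = bin(TimeGenerated, 1d)
| invoke detect_anomalous_spike_fl(numericColumnName = 'failedLogins'
    , entityColumnName = 'userName'
    , scopeColumnName = 'subscriptionId'
    , timeColumnName = 'sliceTime'
    , startTraining = ago(60d)
    , startDetection = ago(1d)
    , endDetection = now()
)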
3 - graph_blast_radius_fl()
Calculate the Blast Radius (list and score) of source nodes over path or edge data.
The function graph_blast_radius_fl() is a UDF (user-defined function) that allows you to calculate the Blast Radius of each source node based on path or edge data. Each row of input data contains a source node and a target node, which can represent either a direct connection (edge) between them or a longer multi-hop path. If the paths aren’t available, we can first discover them using the graph-match operator or the graph_path_discovery_fl() function. Then graph_blast_radius_fl() can be executed on top of the output of path discovery.
Blast Radius represents the connectivity of a specific source node to relevant targets. The more targets the source can access, the greater the impact if it’s compromised by an attacker - hence the name. Nodes with high Blast Radius are important in the cybersecurity domain because of the potential damage they might cause and because they’re highly valued by attackers. Thus, nodes with high Blast Radius should be protected accordingly - in terms of hardening and of prioritizing security signals such as alerts.
The function outputs a list of connected targets for each source, together with a score representing the number of targets. Optionally, if there’s a meaningful ‘weight’ for each target (such as criticality or cost), a weighted score is calculated as the sum of the targets’ weights. In addition, the maximum total number of returned sources and the maximum number of targets listed for each source are exposed as optional parameters for better control.
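As a minimal illustration of how the two scores relate (assumed toy data; not part of the function):
// Toy example: one source reaching three targets. blastRadiusScore counts distinct targets;
// blastRadiusScoreWeighted sums their (assumed) criticality weights.
datatable (sourceId:string, targetId:string, targetWeight:long) [
    'vm-work-1', 'webapp-prd', 3,
    'vm-work-1', 'storage_main_backup', 5,
    'vm-work-1', 'hub_router', 2
]
| summarize blastRadiusList = make_set(targetId), blastRadiusScore = dcount(targetId), blastRadiusScoreWeighted = sum(targetWeight) by sourceId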
Syntax
graph_blast_radius_fl(sourceIdColumnName, targetIdColumnName, [targetWeightColumnName], [resultCountLimit], [listedIdsLimit])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| sourceIdColumnName | string | ✔️ | The name of the column containing the source node IDs (either for edges or paths). |
| targetIdColumnName | string | ✔️ | The name of the column containing the target node IDs (either for edges or paths). |
| targetWeightColumnName | string | | The name of the column containing the target nodes’ weights (such as criticality). If no relevant weights are present, the weighted score is equal to 0. The default column name is noWeightsColumn. |
| resultCountLimit | long | | The maximum number of returned rows (sorted by descending score). The default value is 100000. |
| listedIdsLimit | long | | The maximum number of targets listed for each source. The default value is 50. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let graph_blast_radius_fl = (T:(*), sourceIdColumnName:string, targetIdColumnName:string, targetWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend targetWeight = tolong(column_ifexists(targetWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by sourceId, targetWeight desc
| summarize blastRadiusList = array_slice(make_set_if(targetId, isnotempty(targetId)), 0, (listedIdsLimit - 1))
, blastRadiusScore = dcountif(targetId, isnotempty(targetId))
, blastRadiusScoreWeighted = sum(targetWeight)
by sourceId
| extend isBlastRadiusListCapped = (blastRadiusScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by blastRadiusScore desc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Calculate the Blast Radius (list and score) of source nodes over path or edge data", skipvalidation = "true", folder = 'Cybersecurity')
graph_blast_radius_fl (T:(*), sourceIdColumnName:string, targetIdColumnName:string, targetWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend targetWeight = tolong(column_ifexists(targetWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by sourceId, targetWeight desc
| summarize blastRadiusList = array_slice(make_set_if(targetId, isnotempty(targetId)), 0, (listedIdsLimit - 1))
, blastRadiusScore = dcountif(targetId, isnotempty(targetId))
, blastRadiusScoreWeighted = sum(targetWeight)
by sourceId
| extend isBlastRadiusListCapped = (blastRadiusScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by blastRadiusScore desc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let connections = datatable (SourceNodeName:string, TargetNodeName:string, TargetNodeCriticality:int)[
'vm-work-1', 'webapp-prd', 3,
'vm-custom', 'webapp-prd', 3,
'webapp-prd', 'vm-custom', 1,
'webapp-prd', 'test-machine', 1,
'vm-custom', 'server-0126', 1,
'vm-custom', 'hub_router', 2,
'webapp-prd', 'hub_router', 2,
'test-machine', 'vm-custom', 1,
'test-machine', 'hub_router', 2,
'hub_router', 'remote_DT', 1,
'vm-work-1', 'storage_main_backup', 5,
'hub_router', 'vm-work-2', 1,
'vm-work-2', 'backup_prc', 3,
'remote_DT', 'backup_prc', 3,
'backup_prc', 'storage_main_backup', 5,
'backup_prc', 'storage_DevBox', 1,
'device_A1', 'sevice_B2', 2,
'sevice_B2', 'device_A1', 2
];
let graph_blast_radius_fl = (T:(*), sourceIdColumnName:string, targetIdColumnName:string, targetWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend targetWeight = tolong(column_ifexists(targetWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by sourceId, targetWeight desc
| summarize blastRadiusList = array_slice(make_set_if(targetId, isnotempty(targetId)), 0, (listedIdsLimit - 1))
, blastRadiusScore = dcountif(targetId, isnotempty(targetId))
, blastRadiusScoreWeighted = sum(targetWeight)
by sourceId
| extend isBlastRadiusListCapped = (blastRadiusScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by blastRadiusScore desc
};
connections
| invoke graph_blast_radius_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, targetWeightColumnName = 'TargetNodeCriticality'
)
Stored
let connections = datatable (SourceNodeName:string, TargetNodeName:string, TargetNodeCriticality:int)[
'vm-work-1', 'webapp-prd', 3,
'vm-custom', 'webapp-prd', 3,
'webapp-prd', 'vm-custom', 1,
'webapp-prd', 'test-machine', 1,
'vm-custom', 'server-0126', 1,
'vm-custom', 'hub_router', 2,
'webapp-prd', 'hub_router', 2,
'test-machine', 'vm-custom', 1,
'test-machine', 'hub_router', 2,
'hub_router', 'remote_DT', 1,
'vm-work-1', 'storage_main_backup', 5,
'hub_router', 'vm-work-2', 1,
'vm-work-2', 'backup_prc', 3,
'remote_DT', 'backup_prc', 3,
'backup_prc', 'storage_main_backup', 5,
'backup_prc', 'storage_DevBox', 1,
'device_A1', 'sevice_B2', 2,
'sevice_B2', 'device_A1', 2
];
connections
| invoke graph_blast_radius_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, targetWeightColumnName = 'TargetNodeCriticality'
)
Output
| sourceId | blastRadiusList | blastRadiusScore | blastRadiusScoreWeighted | isBlastRadiusListCapped |
|---|---|---|---|---|
| webapp-prd | [“vm-custom”,“test-machine”,“hub_router”] | 3 | 4 | FALSE |
| vm-custom | [“webapp-prd”,“server-0126”,“hub_router”] | 3 | 6 | FALSE |
| test-machine | [“vm-custom”,“hub_router”] | 2 | 3 | FALSE |
| vm-work-1 | [“webapp-prd”,“storage_main_backup”] | 2 | 8 | FALSE |
| backup_prc | [“storage_main_backup”,“storage_DevBox”] | 2 | 6 | FALSE |
| hub_router | [“remote_DT”,“vm-work-2”] | 2 | 2 | FALSE |
| vm-work-2 | [“backup_prc”] | 1 | 3 | FALSE |
| device_A1 | [“sevice_B2”] | 1 | 2 | FALSE |
| remote_DT | [“backup_prc”] | 1 | 3 | FALSE |
| sevice_B2 | [“device_A1”] | 1 | 2 | FALSE |
Running the function aggregates the connections or paths between sources and targets by source. For each source, Blast Radius represents the connected targets as score (regular and weighted) and list.
Each row in the output contains the following fields:
- sourceId: The ID of the source node, taken from the relevant column.
- blastRadiusList: A list of target node IDs (taken from the relevant column) that the source node is connected to. The list is capped to the maximum length defined by the listedIdsLimit parameter.
- blastRadiusScore: The count of target nodes that the source is connected to. A high Blast Radius score indicates that the source node can potentially access many targets, and should be treated accordingly.
- blastRadiusScoreWeighted: The sum of the optional target nodes’ weight column, representing their value - such as criticality or cost. If such a weight exists, the weighted Blast Radius score might be a more accurate metric of source node importance due to potential access to high-value targets.
- isBlastRadiusListCapped: A Boolean flag indicating whether the list of targets was capped by the listedIdsLimit parameter. If true, other targets can be accessed from the source in addition to the listed ones (up to the number of blastRadiusScore).
In the example above, we run the graph_blast_radius_fl() function on top of connections between sources and targets. In the first row of the output, we can see that source node 'webapp-prd' is connected to three targets ('vm-custom', 'test-machine', 'hub_router'). We use the input data TargetNodeCriticality column as target weights, and get a cumulative weight of 4. Also, since the number of targets is 3 and the default list limit is 50, all of the targets are shown - so the value of the isBlastRadiusListCapped column is FALSE.
If multi-hop paths aren’t readily available, we can first build them between sources and targets (for example, by running graph_path_discovery_fl()) and then run graph_blast_radius_fl() on top of the results, as shown in the following sketch.
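The following is a minimal sketch, assuming the stored graph_blast_radius_fl() function and a hypothetical tabular result named discoveredPaths produced by graph_path_discovery_fl() (its output includes sourceId and targetId columns):
// 'discoveredPaths' is a hypothetical name for the output of graph_path_discovery_fl().
discoveredPaths
| invoke graph_blast_radius_fl(sourceIdColumnName = 'sourceId'
, targetIdColumnName = 'targetId'
)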
The output looks similar, but represents Blast Radius calculated over multi-hop paths, and is thus a better indicator of source nodes’ true connectivity to relevant targets. To find the full paths between sources and targets for relevant scenarios (for example, disruption), the graph_path_discovery_fl() function can be used with filters on relevant source and target nodes.
The function graph_blast_radius_fl() can be used to calculate the Blast Radius of source nodes, calculated either over direct edges or longer paths. In the cybersecurity domain, it can provide several insights. Blast Radius scores, regular and weighted, represent a source node’s importance from both defenders’ and attackers’ perspectives. Nodes with a high Blast Radius should be protected accordingly, for example, in terms of access hardening and vulnerability management. Security signals such as alerts on such nodes should be prioritized. The Blast Radius list should be monitored for undesired connections between sources and targets and used in disruption scenarios. For example, if the source was compromised, connections between it and important targets should be broken.
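As a usage sketch (an assumption, not part of the original example), the connections table and the invocation from the example above can be extended to surface only the sources whose weighted Blast Radius exceeds a chosen threshold (5 here, chosen arbitrarily), for example to prioritize alerts on those sources:
connections
| invoke graph_blast_radius_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, targetWeightColumnName = 'TargetNodeCriticality'
)
// Keep only sources whose weighted Blast Radius is at or above the chosen threshold.
| where blastRadiusScoreWeighted >= 5
| project sourceId, blastRadiusScore, blastRadiusScoreWeighted, blastRadiusList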
Related content
4 - graph_exposure_perimeter_fl()
Calculate the Exposure Perimeter (list and score) of target nodes over path or edge data.
The function graph_exposure_perimeter_fl() is a UDF (user-defined function) that allows you to calculate the Exposure Perimeter of each target node based on paths or edges data. Each row of input data contains a source node and a target node, which can represent direct connections (edges) between sources and targets, or longer multi-hop paths between them. If the paths aren’t available, we can first discover them using the graph-match operator or the graph_path_discovery_fl() function, and then run graph_exposure_perimeter_fl() on top of the path discovery output.
Exposure Perimeter represents the accessibility of a specific target from relevant source nodes. The more sources can access the target, the more exposed it is to potential compromise by an attacker - hence the name. Nodes with high Exposure Perimeter are important in the cybersecurity domain due to the likelihood that they might be reached illegitimately and because they’re highly valued by attackers. Thus, nodes with high Exposure Perimeter should be protected accordingly - in terms of hardening and monitoring their perimeter.
The function outputs a list of connected sources that can reach each target and a score representing the number of sources. Optionally, if there’s a meaningful ‘weight’ for each source (such as vulnerability or exposedness), a weighted score is calculated as the sum of the sources’ weights. In addition, optional parameters limit the maximum total number of returned targets and the maximum number of sources listed for each target.
Syntax
graph_exposure_perimeter_fl(sourceIdColumnName, targetIdColumnName, [sourceWeightColumnName], [resultCountLimit], [listedIdsLimit])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| sourceIdColumnName | string | ✔️ | The name of the column containing the source node IDs (either for edges or paths). |
| targetIdColumnName | string | ✔️ | The name of the column containing the target node IDs (either for edges or paths). |
| sourceWeightColumnName | string | | The name of the column containing the source nodes’ weights (such as vulnerability). If no relevant weights are present, the weighted score is equal to 0. The default column name is noWeightsColumn. |
| resultCountLimit | long | | The maximum number of returned rows (sorted by descending score). The default value is 100000. |
| listedIdsLimit | long | | The maximum number of sources listed for each target. The default value is 50. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let graph_exposure_perimeter_fl = (T:(*), sourceIdColumnName:string, targetIdColumnName:string, sourceWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend sourceWeight = tolong(column_ifexists(sourceWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by targetId, sourceWeight desc
| summarize exposurePerimeterList = array_slice(make_set_if(sourceId, isnotempty(sourceId)), 0, (listedIdsLimit - 1))
, exposurePerimeterScore = dcountif(sourceId, isnotempty(sourceId))
, exposurePerimeterScoreWeighted = sum(sourceWeight)
by targetId
| extend isExposurePerimeterCapped = (exposurePerimeterScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by exposurePerimeterScore desc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Calculate the Exposure Perimeter (list and score) of target nodes over path or edge data", skipvalidation = "true", folder = 'Cybersecurity')
graph_exposure_perimeter_fl (T:(*), sourceIdColumnName:string, targetIdColumnName:string, sourceWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend sourceWeight = tolong(column_ifexists(sourceWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by targetId, sourceWeight desc
| summarize exposurePerimeterList = array_slice(make_set_if(sourceId, isnotempty(sourceId)), 0, (listedIdsLimit - 1))
, exposurePerimeterScore = dcountif(sourceId, isnotempty(sourceId))
, exposurePerimeterScoreWeighted = sum(sourceWeight)
by targetId
| extend isExposurePerimeterCapped = (exposurePerimeterScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by exposurePerimeterScore desc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let connections = datatable (SourceNodeName:string, TargetNodeName:string, SourceNodeVulnerability:int)[
'vm-work-1', 'webapp-prd', 0,
'vm-custom', 'webapp-prd', 4,
'webapp-prd', 'vm-custom', 1,
'webapp-prd', 'test-machine', 1,
'vm-custom', 'server-0126', 4,
'vm-custom', 'hub_router', 4,
'webapp-prd', 'hub_router', 2,
'test-machine', 'vm-custom', 5,
'test-machine', 'hub_router', 5,
'hub_router', 'remote_DT', 0,
'vm-work-1', 'storage_main_backup', 0,
'hub_router', 'vm-work-2', 0,
'vm-work-2', 'backup_prc', 1,
'remote_DT', 'backup_prc', 2,
'backup_prc', 'storage_main_backup', 0,
'backup_prc', 'storage_DevBox', 0,
'device_A1', 'sevice_B2', 1,
'sevice_B2', 'device_A1', 2
];
let graph_exposure_perimeter_fl = (T:(*), sourceIdColumnName:string, targetIdColumnName:string, sourceWeightColumnName:string = 'noWeightsColumn'
, resultCountLimit:long = 100000, listedIdsLimit:long = 50)
{
let paths = (
T
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend sourceWeight = tolong(column_ifexists(sourceWeightColumnName, 0))
);
let aggregatedPaths = (
paths
| sort by targetId, sourceWeight desc
| summarize exposurePerimeterList = array_slice(make_set_if(sourceId, isnotempty(sourceId)), 0, (listedIdsLimit - 1))
, exposurePerimeterScore = dcountif(sourceId, isnotempty(sourceId))
, exposurePerimeterScoreWeighted = sum(sourceWeight)
by targetId
| extend isExposurePerimeterCapped = (exposurePerimeterScore > listedIdsLimit)
);
aggregatedPaths
| top resultCountLimit by exposurePerimeterScore desc
};
connections
| invoke graph_exposure_perimeter_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, sourceWeightColumnName = 'SourceNodeVulnerability'
)
Stored
let connections = datatable (SourceNodeName:string, TargetNodeName:string, SourceNodeVulnerability:int)[
'vm-work-1', 'webapp-prd', 0,
'vm-custom', 'webapp-prd', 4,
'webapp-prd', 'vm-custom', 1,
'webapp-prd', 'test-machine', 1,
'vm-custom', 'server-0126', 4,
'vm-custom', 'hub_router', 4,
'webapp-prd', 'hub_router', 2,
'test-machine', 'vm-custom', 5,
'test-machine', 'hub_router', 5,
'hub_router', 'remote_DT', 0,
'vm-work-1', 'storage_main_backup', 0,
'hub_router', 'vm-work-2', 0,
'vm-work-2', 'backup_prc', 1,
'remote_DT', 'backup_prc', 2,
'backup_prc', 'storage_main_backup', 0,
'backup_prc', 'storage_DevBox', 0,
'device_A1', 'sevice_B2', 1,
'sevice_B2', 'device_A1', 2
];
connections
| invoke graph_exposure_perimeter_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, sourceWeightColumnName = 'SourceNodeVulnerability'
)
Output
| targetId | exposurePerimeterList | exposurePerimeterScore | exposurePerimeterScoreWeighted | isExposurePerimeterCapped |
|---|---|---|---|---|
| hub_router | [“vm-custom”,“webapp-prd”,“test-machine”] | 3 | 11 | FALSE |
| storage_main_backup | [“vm-work-1”,“backup_prc”] | 2 | 0 | FALSE |
| vm-custom | [“webapp-prd”,“test-machine”] | 2 | 6 | FALSE |
| backup_prc | [“vm-work-2”,“remote_DT”] | 2 | 3 | FALSE |
| webapp-prd | [“vm-work-1”,“vm-custom”] | 2 | 4 | FALSE |
| test-machine | [“webapp-prd”] | 1 | 1 | FALSE |
| server-0126 | [“vm-custom”] | 1 | 4 | FALSE |
| remote_DT | [“hub_router”] | 1 | 0 | FALSE |
| vm-work-2 | [“hub_router”] | 1 | 0 | FALSE |
| storage_DevBox | [“backup_prc”] | 1 | 0 | FALSE |
| device_A1 | [“sevice_B2”] | 1 | 2 | FALSE |
| sevice_B2 | [“device_A1”] | 1 | 1 | FALSE |
Running the function aggregates the connections or paths between sources and targets by target. For each target, Exposure Perimeter represents the sources that can connect to it as score (regular and weighted) and list.
Each row in the output contains the following fields:
- targetId: The ID of the target node, taken from the relevant column.
- exposurePerimeterList: A list of source node IDs (taken from the relevant column) that can connect to the target node. The list is capped to the maximum length defined by the listedIdsLimit parameter.
- exposurePerimeterScore: The count of source nodes that can connect to the target. A high Exposure Perimeter score indicates that the target node can potentially be accessed from many sources, and should be treated accordingly.
- exposurePerimeterScoreWeighted: The sum of the optional source nodes’ weight column, representing their value - such as vulnerability or exposedness. If such a weight exists, the weighted Exposure Perimeter score might be a more accurate metric of target node importance due to potential access from highly vulnerable or exposed sources.
- isExposurePerimeterCapped: A Boolean flag indicating whether the list of sources was capped by the listedIdsLimit parameter. If true, other sources can access the target in addition to the listed ones (up to the number of exposurePerimeterScore).
In the example above, we run the graph_exposure_perimeter_fl() function on top of connections between sources and targets. In the first row of the output, we can see that target node 'hub_router' can be reached from three sources ('vm-custom', 'webapp-prd', 'test-machine'). We use the input data SourceNodeVulnerability column as source weights, and get a cumulative weight of 11. Also, since the number of sources is 3 and the default list limit is 50, all of the sources are shown - so the value of the isExposurePerimeterCapped column is FALSE.
If multi-hop paths aren’t readily available, we can first build them between sources and targets (for example, by running graph_path_discovery_fl()) and then run graph_exposure_perimeter_fl() on top of the results.
The output looks similar, but represents Exposure Perimeter calculated over multi-hop paths, and is thus a better indicator of target nodes’ true accessibility from relevant sources. To find the full paths between sources and targets for relevant scenarios (for example, disruption), the graph_path_discovery_fl() function can be used with filters on relevant source and target nodes.
The function graph_exposure_perimeter_fl() can be used to calculate the Exposure Perimeter of target nodes, either over direct edges or longer paths. In the cybersecurity domain, it can provide several insights. Exposure Perimeter scores, regular and weighted, represent a target node’s importance from both defenders’ and attackers’ perspectives. Nodes with high Exposure Perimeter, especially critical ones, should be protected accordingly, for example, in terms of access monitoring and hardening. Security signals, such as alerts, should be prioritized on sources that can access these nodes. The Exposure Perimeter list should be monitored for undesired connections between sources and targets and used in disruption scenarios. For example, if some of the sources were compromised, connections between them and the target should be broken.
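As a usage sketch (an assumption, not part of the original example), the connections table and the invocation from the example above can be extended to surface only the targets whose weighted Exposure Perimeter exceeds a chosen threshold (5 here, chosen arbitrarily), for example to focus access monitoring on those targets:
connections
| invoke graph_exposure_perimeter_fl(sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, sourceWeightColumnName = 'SourceNodeVulnerability'
)
// Keep only targets reachable from sources whose cumulative vulnerability weight is at or above the threshold.
| where exposurePerimeterScoreWeighted >= 5
| project targetId, exposurePerimeterScore, exposurePerimeterScoreWeighted, exposurePerimeterList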
Related content
5 - graph_node_centrality_fl()
Calculate metrics of node centrality, such as degree and betweenness, over graph edge and node data.
The function graph_node_centrality_fl() is a UDF (user-defined function) that allows you to calculate various metrics of node centrality over graph data. Graph data consists of nodes, such as resources, applications or users, and edges such as existing access permissions or connections. The centrality of a node represents its importance in the graph structure and can be defined and measured in several ways. In cybersecurity, centrality represents a node’s value to attackers; compromising a node with high centrality, such as a well-connected token, offers more opportunities. For defenders, high-centrality nodes are also important and should be protected accordingly. Centrality is calculated directly over edges, as well as over discovered shortest paths. Various centrality metrics can be useful in different security contexts.
The input data for this function should include a table of edges in the format SourceId, EdgeId, TargetId and a list of nodes with optional relevant node properties. Alternatively, graph input can be extracted from other types of data. For example, traffic logs with entries of type User A logged in to resource B can be modeled as edges of type (User A)-[logged in to]->(resource B). The list of distinct users and resources can be modeled as nodes. As part of the function, shortest paths are calculated and used as input for centrality calculations.
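For illustration, the following is a minimal sketch of such modeling, assuming a hypothetical TrafficLogs table with User, Resource, and Subscription columns (all names here are assumptions, not part of the original example):
// Model each distinct (User, Resource) access as a directed edge, scoped by subscription.
let logEdges = (
TrafficLogs
| summarize by SourceNodeName = User, TargetNodeName = Resource, Region = Subscription
| extend EdgeName = strcat(SourceNodeName, '->', TargetNodeName)
);
// Model the distinct users and resources as nodes, keeping the scope column.
let logNodes = (
logEdges
| project NodeName = SourceNodeName, Region
| union (logEdges | project NodeName = TargetNodeName, Region)
| distinct NodeName, Region
);
logNodes
| extend IsValidStart = true, IsValidEnd = true, IsValidConnector = true // placeholder flags; replace with real criteria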
The following assumptions are made:
- All edges are valid for path discovery. Edges that are irrelevant should be filtered out before calculating centrality.
- Edges are unweighted, independent, and unconditional, meaning that all edges have the same probability and moving from B to C isn’t dependent on a previous move from A to B.
- Centrality metrics are calculated over edges as well as simple directional shortest paths without cycles, of type A->B->C. More complex definitions can be made by changing the internal syntax of graph-match operator in the function.
These assumptions can be adapted as needed by changing the internal logic of the function.
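For example, the following is a minimal, self-contained sketch (using a hypothetical three-node mini-graph, not part of the original example) of an adapted pattern that requires every intermediate node on a path to be a valid connector; the pattern inside the function can be changed in a similar way:
let miniEdges = datatable (sourceId:string, targetId:string) [
'A', 'B',
'B', 'C',
'A', 'C'
];
let miniNodes = datatable (nodeId:string, isValidPathStart:bool, isValidPathEnd:bool, isNodeValidConnector:bool) [
'A', true, false, false,
'B', false, false, true,
'C', false, true, false
];
miniEdges
| make-graph sourceId --> targetId with miniNodes on nodeId
// Require that all intermediate nodes on the path are valid connectors.
| graph-match cycles = none (s)-[e*1..3]->(t)
    where (s.isValidPathStart) and (t.isValidPathEnd) and all(inner_nodes(e), isNodeValidConnector)
    project sourceId = s.nodeId, targetId = t.nodeId, innerNodeIds = map(inner_nodes(e), nodeId)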
The function discovers all possible shortest paths between valid sources to valid targets, under optional constraints such as path length limits, maximum output size, and more. Various centrality metrics are calculated over resulting paths as well as original edges, representing different aspects of node importance. The output is a list of nodes that are flagged as relevant using the isValidConnectorColumnName column, with additional columns containing the centrality metrics for each node. The function only uses the required fields, such as node IDs and edge IDs. Other relevant fields, such as types, property lists, security-related scores, or external signals, can be added to logic and output by changing the function definition.
Syntax
graph_node_centrality_fl(edgesTableName, nodesTableName, scopeColumnName, isValidPathStartColumnName, isValidPathEndColumnName, isValidConnectorColumnName, nodeIdColumnName, edgeIdColumnName, sourceIdColumnName, targetIdColumnName, [minPathLength], [maxPathLength], [resultCountLimit])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| edgesTableName | string | ✔️ | The name of the input table containing the edges of the graph. |
| nodesTableName | string | ✔️ | The name of the input table containing the nodes of the graph. |
| scopeColumnName | string | ✔️ | The name of the column in the nodes and edges tables containing the partition or scope (for example, subscription or account). The graph is partitioned by scope, so centrality is calculated separately for each scope. |
| isValidPathStartColumnName | string | ✔️ | The name of the column in nodes table containing a Boolean flag for a node, True meaning that the node is a valid start point for a path and False - not a valid one. |
| isValidPathEndColumnName | string | ✔️ | The name of the column in nodes table containing a Boolean flag for a node, True meaning that the node is a valid end point for a path and False - not a valid one. |
| isValidConnectorColumnName | string | ✔️ | The name of the column in nodes table containing a Boolean flag for a node, True meaning that the node is a valid connector that will be included in the output and False - that it is not a valid one. |
| nodeIdColumnName | string | ✔️ | The name of the column in nodes table containing the node ID. |
| edgeIdColumnName | string | ✔️ | The name of the column in edges table containing the edge ID. |
| sourceIdColumnName | string | ✔️ | The name of the column in edges table containing edge’s source node ID. |
| targetIdColumnName | string | ✔️ | The name of the column in edges table containing edge’s target node ID. |
| minPathLength | long | | The minimum number of steps (edges) in the path. Default value: 1. |
| maxPathLength | long | | The maximum number of steps (edges) in the path. Default value: 8. |
| resultCountLimit | long | | The maximum number of paths returned for output. Default value: 100000. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let graph_node_centrality_fl = ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string, isValidConnectorColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend isNodeValidConnector = column_ifexists(isValidConnectorColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let potentialPairsOnScope = (
nodes
| summarize countSources = dcountif(nodeId, (isValidPathStart)), countTargets = dcountif(nodeId, (isValidPathEnd)) by scope
| project scope, countPotentialPairsOnScope = countSources * countTargets
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing shortest paths between source nodes and target nodes with less than predefined number of hops.
    // Current configuration looks for directed paths without any cycles; this can be changed if needed.
    graph-shortest-paths output = all cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
        // Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, innerNodeIds = map(inner_nodes(e), nodeId)
, innerNodeConnector = map(inner_nodes(e), isNodeValidConnector)
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathEndpointsId = hash_md5(strcat(sourceId, targetId))
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
);
let pathsProcessed = (
paths
| mv-expand with_itemindex = i innerNodeId = innerNodeIds to typeof(string), innerNodeConnector to typeof(bool)
| where (innerNodeConnector)
| summarize countShortestPathsThroughNode = count(), take_any(sourceId, targetId, pathLength) by scope, innerNodeId, pathEndpointsId
| join kind = leftouter (paths | summarize countShortestPaths = count() by scope, pathEndpointsId) on scope, pathEndpointsId
| project-away scope1, pathEndpointsId1
| extend betweennessForPair = (todouble(countShortestPathsThroughNode)/countShortestPaths)
| summarize betweenness = sum(betweennessForPair), countShortestPathsThroughNode = sum(countShortestPathsThroughNode)
, countPairsConnectedByNode = dcount(pathEndpointsId)
by scope, nodeId = innerNodeId
| join kind = leftouter (potentialPairsOnScope) on scope
| extend relativePrestige = round(todouble(countPairsConnectedByNode)/countPotentialPairsOnScope, 6)
| project scope, nodeId, betweenness, relativePrestige, countShortestPathsThroughNode, countPairsConnectedByNode
);
let centrality = (
nodes
| summarize take_any(*) by scope, nodeId
| where (isNodeValidConnector)
| join kind = leftouter (edges | summarize outDegree = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (edges | summarize inDegree = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend inDegree = coalesce(inDegree, 0), outDegree = coalesce(outDegree, 0)
| extend totalDegree = inDegree * outDegree
| join kind = leftouter (paths | summarize sourceOutFlow = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (paths | summarize sinkInFlow = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend sourceOutFlow = coalesce(sourceOutFlow, 0), sinkInFlow = coalesce(sinkInFlow, 0)
| join kind = leftouter (pathsProcessed) on scope, nodeId
| project-away scope1, nodeId1
| extend betweenness = coalesce(betweenness, 0.0), relativePrestige = coalesce(relativePrestige, 0.0)
, countShortestPathsThroughNode = coalesce(countShortestPathsThroughNode, 0), countPairsConnectedByNode = coalesce(countPairsConnectedByNode, 0)
);
centrality
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Calculate various centrality metrics for relevant nodes over graph data (edges and nodes) and discovered paths between valid endpoints (sych as exposed and critical assets) per scope (such as subscription or device)", skipvalidation = "true", folder = 'Cybersecurity')
graph_node_centrality_fl ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string, isValidConnectorColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend isNodeValidConnector = column_ifexists(isValidConnectorColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let potentialPairsOnScope = (
nodes
| summarize countSources = dcountif(nodeId, (isValidPathStart)), countTargets = dcountif(nodeId, (isValidPathEnd)) by scope
| project scope, countPotentialPairsOnScope = countSources * countTargets
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing shortest paths between source nodes and target nodes with less than predefined number of hops.
    // Current configuration looks for directed paths without any cycles; this can be changed if needed.
    graph-shortest-paths output = all cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
        // Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, innerNodeIds = map(inner_nodes(e), nodeId)
, innerNodeConnector = map(inner_nodes(e), isNodeValidConnector)
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathEndpointsId = hash_md5(strcat(sourceId, targetId))
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
);
let pathsProcessed = (
paths
| mv-expand with_itemindex = i innerNodeId = innerNodeIds to typeof(string), innerNodeConnector to typeof(bool)
| where (innerNodeConnector)
| summarize countShortestPathsThroughNode = count(), take_any(sourceId, targetId, pathLength) by scope, innerNodeId, pathEndpointsId
| join kind = leftouter (paths | summarize countShortestPaths = count() by scope, pathEndpointsId) on scope, pathEndpointsId
| project-away scope1, pathEndpointsId1
| extend betweennessForPair = (todouble(countShortestPathsThroughNode)/countShortestPaths)
| summarize betweenness = sum(betweennessForPair), countShortestPathsThroughNode = sum(countShortestPathsThroughNode)
, countPairsConnectedByNode = dcount(pathEndpointsId)
by scope, nodeId = innerNodeId
| join kind = leftouter (potentialPairsOnScope) on scope
| extend relativePrestige = round(todouble(countPairsConnectedByNode)/countPotentialPairsOnScope, 6)
| project scope, nodeId, betweenness, relativePrestige, countShortestPathsThroughNode, countPairsConnectedByNode
);
let centrality = (
nodes
| summarize take_any(*) by scope, nodeId
| where (isNodeValidConnector)
| join kind = leftouter (edges | summarize outDegree = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (edges | summarize inDegree = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend inDegree = coalesce(inDegree, 0), outDegree = coalesce(outDegree, 0)
| extend totalDegree = inDegree * outDegree
| join kind = leftouter (paths | summarize sourceOutFlow = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (paths | summarize sinkInFlow = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend sourceOutFlow = coalesce(sourceOutFlow, 0), sinkInFlow = coalesce(sinkInFlow, 0)
| join kind = leftouter (pathsProcessed) on scope, nodeId
| project-away scope1, nodeId1
| extend betweenness = coalesce(betweenness, 0.0), relativePrestige = coalesce(relativePrestige, 0.0)
, countShortestPathsThroughNode = coalesce(countShortestPathsThroughNode, 0), countPairsConnectedByNode = coalesce(countPairsConnectedByNode, 0)
);
centrality
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let edges = datatable (SourceNodeName:string, EdgeName:string, EdgeType:string, TargetNodeName:string, Region:string)[
'vm-work-1', 'e1', 'can use', 'webapp-prd', 'US',
'vm-custom', 'e2', 'can use', 'webapp-prd', 'US',
'webapp-prd', 'e3', 'can access', 'vm-custom', 'US',
'webapp-prd', 'e4', 'can access', 'test-machine', 'US',
'vm-custom', 'e5', 'can access', 'server-0126', 'US',
'vm-custom', 'e6', 'can access', 'hub_router', 'US',
'webapp-prd', 'e7', 'can access', 'hub_router', 'US',
'test-machine', 'e8', 'can access', 'vm-custom', 'US',
'test-machine', 'e9', 'can access', 'hub_router', 'US',
'hub_router', 'e10', 'routes traffic to', 'remote_DT', 'US',
'vm-work-1', 'e11', 'can access', 'storage_main_backup', 'US',
'hub_router', 'e12', 'routes traffic to', 'vm-work-2', 'US',
'vm-work-2', 'e13', 'can access', 'backup_prc', 'US',
'remote_DT', 'e14', 'can access', 'backup_prc', 'US',
'backup_prc', 'e15', 'moves data to', 'storage_main_backup', 'US',
'backup_prc', 'e16', 'moves data to', 'storage_DevBox', 'US',
'device_A1', 'e17', 'is connected to', 'device_B2', 'EU',
'device_B2', 'e18', 'is connected to', 'device_A1', 'EU'
];
let nodes = datatable (NodeName:string, NodeType:string, NodeEnvironment:string, Region:string) [
'vm-work-1', 'Virtual Machine', 'Production', 'US',
'vm-custom', 'Virtual Machine', 'Production', 'US',
'webapp-prd', 'Application', 'None', 'US',
'test-machine', 'Virtual Machine', 'Test', 'US',
'hub_router', 'Traffic Router', 'None', 'US',
'vm-work-2', 'Virtual Machine', 'Production', 'US',
'remote_DT', 'Virtual Machine', 'Production', 'US',
'backup_prc', 'Service', 'Production', 'US',
'server-0126', 'Server', 'Production', 'US',
'storage_main_backup', 'Cloud Storage', 'Production', 'US',
'storage_DevBox', 'Cloud Storage', 'Test', 'US',
'device_A1', 'Device', 'Backend', 'EU',
'device_B2', 'Device', 'Backend', 'EU'
];
let nodesEnriched = (
nodes
| extend IsValidStart = (NodeType in ('Virtual Machine'))
, IsValidEnd = (NodeType in ('Cloud Storage'))
| extend IsValidConnector = (NodeType in ('Application', 'Traffic Router', 'Service'))
);
let graph_node_centrality_fl = ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string, isValidConnectorColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend isNodeValidConnector = column_ifexists(isValidConnectorColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let potentialPairsOnScope = (
nodes
| summarize countSources = dcountif(nodeId, (isValidPathStart)), countTargets = dcountif(nodeId, (isValidPathEnd)) by scope
| project scope, countPotentialPairsOnScope = countSources * countTargets
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing shortest paths between source nodes and target nodes with less than predefined number of hops.
    // Current configuration looks for directed paths without any cycles; this can be changed if needed.
    graph-shortest-paths output = all cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
        // Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, innerNodeIds = map(inner_nodes(e), nodeId)
, innerNodeConnector = map(inner_nodes(e), isNodeValidConnector)
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathEndpointsId = hash_md5(strcat(sourceId, targetId))
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
);
let pathsProcessed = (
paths
| mv-expand with_itemindex = i innerNodeId = innerNodeIds to typeof(string), innerNodeConnector to typeof(bool)
| where (innerNodeConnector)
| summarize countShortestPathsThroughNode = count(), take_any(sourceId, targetId, pathLength) by scope, innerNodeId, pathEndpointsId
| join kind = leftouter (paths | summarize countShortestPaths = count() by scope, pathEndpointsId) on scope, pathEndpointsId
| project-away scope1, pathEndpointsId1
| extend betweennessForPair = (todouble(countShortestPathsThroughNode)/countShortestPaths)
| summarize betweenness = sum(betweennessForPair), countShortestPathsThroughNode = sum(countShortestPathsThroughNode)
, countPairsConnectedByNode = dcount(pathEndpointsId)
by scope, nodeId = innerNodeId
| join kind = leftouter (potentialPairsOnScope) on scope
| extend relativePrestige = round(todouble(countPairsConnectedByNode)/countPotentialPairsOnScope, 6)
| project scope, nodeId, betweenness, relativePrestige, countShortestPathsThroughNode, countPairsConnectedByNode
);
let centrality = (
nodes
| summarize take_any(*) by scope, nodeId
| where (isNodeValidConnector)
| join kind = leftouter (edges | summarize outDegree = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (edges | summarize inDegree = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend inDegree = coalesce(inDegree, 0), outDegree = coalesce(outDegree, 0)
| extend totalDegree = inDegree * outDegree
| join kind = leftouter (paths | summarize sourceOutFlow = dcount(targetId) by scope, sourceId) on scope, $left.nodeId == $right.sourceId
| join kind = leftouter (paths | summarize sinkInFlow = dcount(sourceId) by scope, targetId) on scope, $left.nodeId == $right.targetId
| project-away scope1, scope2, sourceId, targetId
| extend sourceOutFlow = coalesce(sourceOutFlow, 0), sinkInFlow = coalesce(sinkInFlow, 0)
| join kind = leftouter (pathsProcessed) on scope, nodeId
| project-away scope1, nodeId1
| extend betweenness = coalesce(betweenness, 0.0), relativePrestige = coalesce(relativePrestige, 0.0)
, countShortestPathsThroughNode = coalesce(countShortestPathsThroughNode, 0), countPairsConnectedByNode = coalesce(countPairsConnectedByNode, 0)
);
centrality
};
graph_node_centrality_fl(edgesTableName = 'edges'
, nodesTableName = 'nodesEnriched'
, scopeColumnName = 'Region'
, nodeIdColumnName = 'NodeName'
, edgeIdColumnName = 'EdgeName'
, sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, isValidPathStartColumnName = 'IsValidStart'
, isValidPathEndColumnName = 'IsValidEnd'
, isValidConnectorColumnName = 'IsValidConnector'
)
Stored
let edges = datatable (SourceNodeName:string, EdgeName:string, EdgeType:string, TargetNodeName:string, Region:string)[
'vm-work-1', 'e1', 'can use', 'webapp-prd', 'US',
'vm-custom', 'e2', 'can use', 'webapp-prd', 'US',
'webapp-prd', 'e3', 'can access', 'vm-custom', 'US',
'webapp-prd', 'e4', 'can access', 'test-machine', 'US',
'vm-custom', 'e5', 'can access', 'server-0126', 'US',
'vm-custom', 'e6', 'can access', 'hub_router', 'US',
'webapp-prd', 'e7', 'can access', 'hub_router', 'US',
'test-machine', 'e8', 'can access', 'vm-custom', 'US',
'test-machine', 'e9', 'can access', 'hub_router', 'US',
'hub_router', 'e10', 'routes traffic to', 'remote_DT', 'US',
'vm-work-1', 'e11', 'can access', 'storage_main_backup', 'US',
'hub_router', 'e12', 'routes traffic to', 'vm-work-2', 'US',
'vm-work-2', 'e13', 'can access', 'backup_prc', 'US',
'remote_DT', 'e14', 'can access', 'backup_prc', 'US',
'backup_prc', 'e15', 'moves data to', 'storage_main_backup', 'US',
'backup_prc', 'e16', 'moves data to', 'storage_DevBox', 'US',
'device_A1', 'e17', 'is connected to', 'device_B2', 'EU',
'device_B2', 'e18', 'is connected to', 'device_A1', 'EU'
];
let nodes = datatable (NodeName:string, NodeType:string, NodeEnvironment:string, Region:string) [
'vm-work-1', 'Virtual Machine', 'Production', 'US',
'vm-custom', 'Virtual Machine', 'Production', 'US',
'webapp-prd', 'Application', 'None', 'US',
'test-machine', 'Virtual Machine', 'Test', 'US',
'hub_router', 'Traffic Router', 'None', 'US',
'vm-work-2', 'Virtual Machine', 'Production', 'US',
'remote_DT', 'Virtual Machine', 'Production', 'US',
'backup_prc', 'Service', 'Production', 'US',
'server-0126', 'Server', 'Production', 'US',
'storage_main_backup', 'Cloud Storage', 'Production', 'US',
'storage_DevBox', 'Cloud Storage', 'Test', 'US',
'device_A1', 'Device', 'Backend', 'EU',
'device_B2', 'Device', 'Backend', 'EU'
];
let nodesEnriched = (
nodes
| extend IsValidStart = (NodeType in ('Virtual Machine'))
, IsValidEnd = (NodeType in ('Cloud Storage'))
| extend IsValidConnector = (NodeType in ('Application', 'Traffic Router', 'Service'))
);
graph_node_centrality_fl(edgesTableName = 'edges'
, nodesTableName = 'nodesEnriched'
, scopeColumnName = 'Region'
, nodeIdColumnName = 'NodeName'
, edgeIdColumnName = 'EdgeName'
, sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, isValidPathStartColumnName = 'IsValidStart'
, isValidPathEndColumnName = 'IsValidEnd'
, isValidConnectorColumnName = 'IsValidConnector'
)
Output
| scope | nodeId | NodeName | NodeType | NodeEnvironment | Region | IsValidStart | IsValidEnd | IsValidConnector | isValidPathStart | isValidPathEnd | isNodeValidConnector | outDegree | inDegree | totalDegree | sourceOutFlow | sinkInFlow | betweenness | relativePrestige | countShortestPathsThroughNode | countPairsConnectedByNode |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| US | backup_prc | backup_prc | Service | Production | US | False | False | True | False | False | True | 2 | 2 | 4 | 0 | 0 | 9 | 0.9 | 14 | 9 |
Running the function finds all shortest paths connecting source nodes flagged as valid start points (isSourceValidPathStart == True) to targets flagged as valid end points (isTargetValidPathEnd == True). Various centrality metrics are calculated on top of these paths and original edges for all nodes flagged as valid connectors (isValidConnector == True). The output is a table where each row corresponds to a valid connector node. Each row contains the following fields:
- nodeId: The ID of the connector node.
- isValidConnector: A Boolean flag for the node being a valid connector for which we want to calculate centrality; should be equal to True.
- isSourceValidPathStart: A Boolean flag for the node being a valid path start.
- isTargetValidPathEnd: A Boolean flag for the node being a valid path end.
- scope: The scope containing the node and the paths.
- outDegree: The OutDegree of the node. This is the number of distinct targets on outgoing edges adjacent to the node.
- inDegree: The InDegree of the node. This is the number of distinct sources on incoming edges of the node.
- totalDegree: The inDegree multiplied by the outDegree. The value represents the potential number of paths that the node can create, since all the incoming edges are connected to all the outgoing ones.
- sourceOutFlow: The number of targets that can be reached via paths starting with the node, similar to Blast Radius.
- sinkInFlow: The number of sources that can reach the node via paths, similar to Exposure Perimeter.
- betweenness: The Betweenness centrality, the fraction of shortest paths that pass through the node out of all shortest paths.
- relativePrestige: The Prestige centrality is the count of source/target pairs connected by shortest paths passing through the node. Relative prestige normalizes this count by the number of all potential source and target pairs. The calculation can be adapted to penalize the score for longer paths.
- countShortestPathsThroughNode: The number of shortest paths that pass through the node, including those with recurring source and target pairs.
- countPairsConnectedByNode: The number of distinct source and target pairs from paths that pass through the node.
The example calculated the centrality metrics for all assets that are either applications, traffic routers, or services, based on paths connecting virtual machines to storage accounts. In the first row of the output, when sorted by descending betweenness, you can see the service backup_prc. It has in and out degrees of 2, a betweenness of 9, and so on. Different centrality metrics represent different aspects of importance, so they are not perfectly aligned. For example, node backup_prc has high betweenness and relativePrestige but low degrees, which highlights it as a node that doesn’t have many direct edges, but is placed strategically and plays an important role in the overall relativePrestige of its scope.
The function graph_node_centrality_fl() can be used in the cybersecurity domain to discover important nodes, such as well-connected tokens or users, over data modeled as a graph. The various available centrality metrics provide a better understanding of a node's posture and allow you to act accordingly, for example, by prioritizing related signals, hardening the node, or disrupting unnecessary connections.
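As a usage sketch (an assumption, not part of the original example), the output can be ranked by betweenness to decide which connector nodes to harden first, reusing the edges and nodesEnriched tables and the invocation from the example above:
graph_node_centrality_fl(edgesTableName = 'edges'
, nodesTableName = 'nodesEnriched'
, scopeColumnName = 'Region'
, nodeIdColumnName = 'NodeName'
, edgeIdColumnName = 'EdgeName'
, sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, isValidPathStartColumnName = 'IsValidStart'
, isValidPathEndColumnName = 'IsValidEnd'
, isValidConnectorColumnName = 'IsValidConnector'
)
// Rank connector nodes by betweenness and keep the top candidates for hardening.
| top 3 by betweenness desc
| project scope, nodeId, betweenness, relativePrestige, totalDegree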
Related content
6 - graph_path_discovery_fl()
Discover valid paths between relevant endpoints (sources and targets) over graph data (edge and nodes).
The function graph_path_discovery_fl() is a UDF (user-defined function) that allows you to discover valid paths between relevant endpoints over graph data. Graph data consists of nodes (for example, resources, applications, or users) and edges (for example, existing access permissions). In a cybersecurity context, such paths might represent possible lateral movement paths that a potential attacker can utilize. We’re interested in paths connecting endpoints defined as relevant by some criteria - for example, exposed sources connected to critical targets. Based on the function’s configuration, other types of paths, suitable for other security scenarios, can be discovered.
The input data for this function should include a table of edges in the format ‘SourceId, EdgeId, TargetId’ and a list of nodes with optional nodes’ properties that can be used to define valid paths. Alternatively, graph input can be extracted from other types of data. For example, traffic logs with entries of type ‘User A logged in to resource B’ can be modeled as edges of type ‘(User A)-[logged in to]->(resource B)’. The list of distinct users and resources can be modeled as nodes.
We make several assumptions:
- All edges are valid for path discovery. Edges that are irrelevant should be filtered out before running path discovery.
- Edges are unweighted, independent, and unconditional, meaning that all edges have the same probability and moving from B to C isn’t dependent on a previous move from A to B.
- Paths we want to discover are simple directional paths without cycles, of type A->B->C. More complex definitions can be made by changing the internal syntax of graph-match operator in the function.
These assumptions can be adapted as needed by changing the internal logic of the function.
The function discovers all possible paths between valid sources and valid targets, under optional constraints such as path length limits and maximum output size. The output is a list of discovered paths with source and target IDs, as well as a list of connecting edges and nodes. The function uses only the required fields, such as node IDs and edge IDs. If other relevant fields - such as types, property lists, security-related scores, or external signals - are available in the input data, they can be added to the logic and output by changing the function definition.
Syntax
graph_path_discovery_fl(edgesTableName, nodesTableName, scopeColumnName, isValidPathStartColumnName, isValidPathEndColumnName, nodeIdColumnName, edgeIdColumnName, sourceIdColumnName, targetIdColumnName, [minPathLength], [maxPathLength], [resultCountLimit])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| edgesTableName | string | ✔️ | The name of the input table containing the edges of the graph. |
| nodesTableName | string | ✔️ | The name of the input table containing the nodes of the graph. |
| scopeColumnName | string | ✔️ | The name of the column in the nodes and edges tables containing the partition or scope (for example, subscription or account). The graph is partitioned by scope, so paths are discovered separately within each scope. |
| isValidPathStartColumnName | string | ✔️ | The name of the column in nodes table containing a Boolean flag for a node, True meaning that the node is a valid start point for a path and False - not a valid one. |
| isValidPathEndColumnName | string | ✔️ | The name of the column in nodes table containing a Boolean flag for a node, True meaning that the node is a valid end point for a path and False - not a valid one. |
| nodeIdColumnName | string | ✔️ | The name of the column in nodes table containing the node ID. |
| edgeIdColumnName | string | ✔️ | The name of the column in edges table containing the edge ID. |
| sourceIdColumnName | string | ✔️ | The name of the column in edges table containing edge’s source node ID. |
| targetIdColumnName | string | ✔️ | The name of the column in edges table containing edge’s target node ID. |
| minPathLength | long | | The minimum number of steps (edges) in the path. The default value is 1. |
| maxPathLength | long | | The maximum number of steps (edges) in the path. The default value is 8. |
| resultCountLimit | long | | The maximum number of paths returned for output. The default value is 100000. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let graph_path_discovery_fl = ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing paths between source nodes and target nodes with less than predefined number of hops
    // Current configuration looks for directed paths without any cycles; this can be changed if needed
    graph-match cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
        // Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, edgeAllTargetIds = e.targetId
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
, pathAllNodeIds = array_concat(pack_array(sourceId), edgeAllTargetIds)
| project-away edgeAllTargetIds
| mv-apply with_itemindex = SortIndex nodesInPath = pathAllNodeIds to typeof(string), edgesInPath = edgeIds to typeof(string) on (
extend step = strcat(
iff(isnotempty(nodesInPath), strcat('(', nodesInPath, ')'), '')
, iff(isnotempty(edgesInPath), strcat('-[', edgesInPath, ']->'), ''))
| summarize fullPath = array_strcat(make_list(step), '')
)
);
paths
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Build paths on graph data (edges and nodes) between valid endpoints (sych as exposed and critical assets) per scope (such as subscription or device)", skipvalidation = "true", folder = 'Cybersecurity')
graph_path_discovery_fl ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing paths between source nodes and target nodes within the predefined number of hops
// The current configuration looks for directed paths without any cycles; this can be changed if needed
graph-match cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
// Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, edgeAllTargetIds = e.targetId
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
, pathAllNodeIds = array_concat(pack_array(sourceId), edgeAllTargetIds)
| project-away edgeAllTargetIds
| mv-apply with_itemindex = SortIndex nodesInPath = pathAllNodeIds to typeof(string), edgesInPath = edgeIds to typeof(string) on (
extend step = strcat(
iff(isnotempty(nodesInPath), strcat('(', nodesInPath, ')'), '')
, iff(isnotempty(edgesInPath), strcat('-[', edgesInPath, ']->'), ''))
| summarize fullPath = array_strcat(make_list(step), '')
)
);
paths
}
Example
The following example shows how to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let edges = datatable (SourceNodeName:string, EdgeName:string, EdgeType:string, TargetNodeName:string, Region:string)[
'vm-work-1', 'e1', 'can use', 'webapp-prd', 'US',
'vm-custom', 'e2', 'can use', 'webapp-prd', 'US',
'webapp-prd', 'e3', 'can access', 'vm-custom', 'US',
'webapp-prd', 'e4', 'can access', 'test-machine', 'US',
'vm-custom', 'e5', 'can access', 'server-0126', 'US',
'vm-custom', 'e6', 'can access', 'hub_router', 'US',
'webapp-prd', 'e7', 'can access', 'hub_router', 'US',
'test-machine', 'e8', 'can access', 'vm-custom', 'US',
'test-machine', 'e9', 'can access', 'hub_router', 'US',
'hub_router', 'e10', 'routes traffic to', 'remote_DT', 'US',
'vm-work-1', 'e11', 'can access', 'storage_main_backup', 'US',
'hub_router', 'e12', 'routes traffic to', 'vm-work-2', 'US',
'vm-work-2', 'e13', 'can access', 'backup_prc', 'US',
'remote_DT', 'e14', 'can access', 'backup_prc', 'US',
'backup_prc', 'e15', 'moves data to', 'storage_main_backup', 'US',
'backup_prc', 'e16', 'moves data to', 'storage_DevBox', 'US',
'device_A1', 'e17', 'is connected to', 'sevice_B2', 'EU',
'sevice_B2', 'e18', 'is connected to', 'device_A1', 'EU'
];
let nodes = datatable (NodeName:string, NodeType:string, NodeEnvironment:string, Region:string) [
'vm-work-1', 'Virtual Machine', 'Production', 'US',
'vm-custom', 'Virtual Machine', 'Production', 'US',
'webapp-prd', 'Application', 'None', 'US',
'test-machine', 'Virtual Machine', 'Test', 'US',
'hub_router', 'Traffic Router', 'None', 'US',
'vm-work-2', 'Virtual Machine', 'Production', 'US',
'remote_DT', 'Virtual Machine', 'Production', 'US',
'backup_prc', 'Service', 'Production', 'US',
'server-0126', 'Server', 'Production', 'US',
'storage_main_backup', 'Cloud Storage', 'Production', 'US',
'storage_DevBox', 'Cloud Storage', 'Test', 'US',
'device_A1', 'Device', 'Backend', 'EU',
'device_B2', 'Device', 'Backend', 'EU'
];
let nodesEnriched = (
nodes
| extend IsValidStart = (NodeType == 'Virtual Machine'), IsValidEnd = (NodeType == 'Cloud Storage') // option 1
//| extend IsValidStart = (NodeName in('vm-work-1', 'vm-work-2')), IsValidEnd = (NodeName in('storage_main_backup')) // option 2
//| extend IsValidStart = (NodeEnvironment == 'Test'), IsValidEnd = (NodeEnvironment == 'Production') // option 3
);
let graph_path_discovery_fl = ( edgesTableName:string, nodesTableName:string, scopeColumnName:string
, isValidPathStartColumnName:string, isValidPathEndColumnName:string
, nodeIdColumnName:string, edgeIdColumnName:string, sourceIdColumnName:string, targetIdColumnName:string
, minPathLength:long = 1, maxPathLength:long = 8, resultCountLimit:long = 100000)
{
let edges = (
table(edgesTableName)
| extend sourceId = column_ifexists(sourceIdColumnName, '')
| extend targetId = column_ifexists(targetIdColumnName, '')
| extend edgeId = column_ifexists(edgeIdColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let nodes = (
table(nodesTableName)
| extend nodeId = column_ifexists(nodeIdColumnName, '')
| extend isValidPathStart = column_ifexists(isValidPathStartColumnName, '')
| extend isValidPathEnd = column_ifexists(isValidPathEndColumnName, '')
| extend scope = column_ifexists(scopeColumnName, '')
);
let paths = (
edges
// Build graph object partitioned by scope, so that no connections are allowed between scopes.
// In case no scopes are relevant, partitioning should be removed for better performance.
| make-graph sourceId --> targetId with nodes on nodeId partitioned-by scope (
// Look for existing paths between source nodes and target nodes within the predefined number of hops
// The current configuration looks for directed paths without any cycles; this can be changed if needed
graph-match cycles = none (s)-[e*minPathLength..maxPathLength]->(t)
// Filter only paths that connect valid endpoints
where ((s.isValidPathStart) and (t.isValidPathEnd))
project sourceId = s.nodeId
, isSourceValidPathStart = s.isValidPathStart
, targetId = t.nodeId
, isTargetValidPathEnd = t.isValidPathEnd
, scope = s.scope
, edgeIds = e.edgeId
, edgeAllTargetIds = e.targetId
| limit resultCountLimit
)
| extend pathLength = array_length(edgeIds)
, pathId = hash_md5(strcat(sourceId, strcat(edgeIds), targetId))
, pathAllNodeIds = array_concat(pack_array(sourceId), edgeAllTargetIds)
| project-away edgeAllTargetIds
| mv-apply with_itemindex = SortIndex nodesInPath = pathAllNodeIds to typeof(string), edgesInPath = edgeIds to typeof(string) on (
extend step = strcat(
iff(isnotempty(nodesInPath), strcat('(', nodesInPath, ')'), '')
, iff(isnotempty(edgesInPath), strcat('-[', edgesInPath, ']->'), ''))
| summarize fullPath = array_strcat(make_list(step), '')
)
);
paths
};
graph_path_discovery_fl(edgesTableName = 'edges'
, nodesTableName = 'nodesEnriched'
, scopeColumnName = 'Region'
, nodeIdColumnName = 'NodeName'
, edgeIdColumnName = 'EdgeName'
, sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, isValidPathStartColumnName = 'IsValidStart'
, isValidPathEndColumnName = 'IsValidEnd'
)
Stored
let edges = datatable (SourceNodeName:string, EdgeName:string, EdgeType:string, TargetNodeName:string, Region:string)[
'vm-work-1', 'e1', 'can use', 'webapp-prd', 'US',
'vm-custom', 'e2', 'can use', 'webapp-prd', 'US',
'webapp-prd', 'e3', 'can access', 'vm-custom', 'US',
'webapp-prd', 'e4', 'can access', 'test-machine', 'US',
'vm-custom', 'e5', 'can access', 'server-0126', 'US',
'vm-custom', 'e6', 'can access', 'hub_router', 'US',
'webapp-prd', 'e7', 'can access', 'hub_router', 'US',
'test-machine', 'e8', 'can access', 'vm-custom', 'US',
'test-machine', 'e9', 'can access', 'hub_router', 'US',
'hub_router', 'e10', 'routes traffic to', 'remote_DT', 'US',
'vm-work-1', 'e11', 'can access', 'storage_main_backup', 'US',
'hub_router', 'e12', 'routes traffic to', 'vm-work-2', 'US',
'vm-work-2', 'e13', 'can access', 'backup_prc', 'US',
'remote_DT', 'e14', 'can access', 'backup_prc', 'US',
'backup_prc', 'e15', 'moves data to', 'storage_main_backup', 'US',
'backup_prc', 'e16', 'moves data to', 'storage_DevBox', 'US',
'device_A1', 'e17', 'is connected to', 'sevice_B2', 'EU',
'sevice_B2', 'e18', 'is connected to', 'device_A1', 'EU'
];
let nodes = datatable (NodeName:string, NodeType:string, NodeEnvironment:string, Region:string) [
'vm-work-1', 'Virtual Machine', 'Production', 'US',
'vm-custom', 'Virtual Machine', 'Production', 'US',
'webapp-prd', 'Application', 'None', 'US',
'test-machine', 'Virtual Machine', 'Test', 'US',
'hub_router', 'Traffic Router', 'None', 'US',
'vm-work-2', 'Virtual Machine', 'Production', 'US',
'remote_DT', 'Virtual Machine', 'Production', 'US',
'backup_prc', 'Service', 'Production', 'US',
'server-0126', 'Server', 'Production', 'US',
'storage_main_backup', 'Cloud Storage', 'Production', 'US',
'storage_DevBox', 'Cloud Storage', 'Test', 'US',
'device_A1', 'Device', 'Backend', 'EU',
'device_B2', 'Device', 'Backend', 'EU'
];
let nodesEnriched = (
nodes
| extend IsValidStart = (NodeType == 'Virtual Machine'), IsValidEnd = (NodeType == 'Cloud Storage') // option 1
//| extend IsValidStart = (NodeName in('vm-work-1', 'vm-work-2')), IsValidEnd = (NodeName in('storage_main_backup')) // option 2
//| extend IsValidStart = (NodeEnvironment == 'Test'), IsValidEnd = (NodeEnvironment == 'Production') // option 3
);
graph_path_discovery_fl(edgesTableName = 'edges'
, nodesTableName = 'nodesEnriched'
, scopeColumnName = 'Region'
, nodeIdColumnName = 'NodeName'
, edgeIdColumnName = 'EdgeName'
, sourceIdColumnName = 'SourceNodeName'
, targetIdColumnName = 'TargetNodeName'
, isValidPathStartColumnName = 'IsValidStart'
, isValidPathEndColumnName = 'IsValidEnd'
)
Output
| sourceId | isSourceValidPathStart | targetId | isTargetValidPathEnd | scope | edgeIds | pathLength | pathId | pathAllNodeIds | fullPath |
|---|---|---|---|---|---|---|---|---|---|
| test-machine | True | storage_DevBox | True | US | [“e9”,“e10”,“e14”,“e16”] | 4 | 00605d35b6e1d28024fd846f217b43ac | [“test-machine”,“hub_router”,“remote_DT”,“backup_prc”,“storage_DevBox”] | (test-machine)-[e9]->(hub_router)-[e10]->(remote_DT)-[e14]->(backup_prc)-[e16]->(storage_DevBox) |
Running the function finds all paths along the input edges that connect source nodes flagged as valid start points (isSourceValidPathStart == True) to target nodes flagged as valid end points (isTargetValidPathEnd == True). The output is a table in which each row describes a single path (limited to a maximum number of rows by the resultCountLimit parameter). Each row contains the following fields:
- sourceId: The nodeId of the source, the first node in the path.
- isSourceValidPathStart: Boolean flag indicating that the source node is a valid path start; always True in the output.
- targetId: The nodeId of the target, the last node in the path.
- isTargetValidPathEnd: Boolean flag indicating that the target node is a valid path end; always True in the output.
- scope: The scope containing the path.
- edgeIds: An ordered list of the edges in the path.
- pathLength: The number of edges (hops) in the path.
- pathId: A hash of the path's endpoints and steps, which can be used as a unique identifier for the path.
- pathAllNodeIds: An ordered list of the nodes in the path.
- fullPath: A string representing the full path, in the format (source node)-[edge 1]->(node2)-...->(target node).
In the previous example, we preprocess the nodes table and add several possible endpoint definitions. By commenting or uncommenting the different options, you can explore several scenarios:
- Option 1: Find paths from Virtual Machines to Cloud Storage resources. Useful in exploring connection patterns between types of nodes.
- Option 2: Find paths from specific nodes (vm-work-1, vm-work-2) to a specific node (storage_main_backup). Useful in investigating known cases - such as paths from known compromised assets to known critical ones.
- Option 3: Find paths between groups of nodes, such as nodes in different environments. Useful for monitoring insecure paths, such as paths between test and production environments.
In the example above we use the first option to find all the paths from VMs to cloud storage resources, which might be used by potential attackers who want to access stored data. This scenario can be strengthened by adding more filters to the valid endpoints - for example, connecting VMs with known vulnerabilities to storage accounts containing sensitive data.
The function graph_path_discovery_fl() can be used in the cybersecurity domain to discover interesting paths, such as lateral movement paths, over data modeled as a graph.
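Because multiple routes often exist between the same endpoints, the raw output can be large. The following minimal sketch, appended to either invocation above (it uses only columns produced by the function), keeps the shortest discovered path per endpoint pair in each scope:
| summarize arg_min(pathLength, fullPath) by scope, sourceId, targetId // keep only the shortest path per source-target pair
| sort by pathLength asc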
7 - bartlett_test_fl()
The bartlett_test_fl() function is a user-defined tabular function that performs the Bartlett test for equal variances.
Syntax
T | invoke bartlett_test_fl(data1, data2, test_statistic, p_value)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Bartlett Test")
bartlett_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Example query that uses the function
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.7660796224425723 | 0.183868001738637 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.9211710616896014 | 0.16572762069132516 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0026985713829234454 | 0.958570306268548 |
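The p_val column can be compared against a significance level to decide whether the hypothesis of equal variances is rejected. A minimal follow-up sketch, appended to either query above (the 0.05 level is an illustrative choice, not part of the function):
| extend equal_variances_rejected = p_val < 0.05 // Bartlett's null hypothesis is that both samples come from populations with equal variances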
8 - binomial_test_fl()
The function binomial_test_fl() is a UDF (user-defined function) that performs the binomial test.
Syntax
T | invoke binomial_test_fl(successes, trials, p_value [, success_prob [, alt_hypotheis]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| successes | string | ✔️ | The name of the column containing the number of success results. |
| trials | string | ✔️ | The name of the column containing the total number of trials. |
| p_value | string | ✔️ | The name of the column to store the results. |
| success_prob | real | | The success probability. The default is 0.5. |
| alt_hypotheis | string | | The alternate hypothesis can be two-sided, greater, or less. The default is two-sided. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Binomial test")
binomial_test_fl(tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')
Stored
datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')
Output
| id | x | n | p_val |
|---|---|---|---|
| Test #1 | 3 | 5 | 0.05792 |
| Test #2 | 5 | 5 | 0.00032 |
| Test #3 | 3 | 15 | 0.601976790745087 |
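The returned p_val can be thresholded in the same way to flag rows where the observed number of successes is unlikely under the assumed success probability. A minimal follow-up sketch, appended to either query above (the 0.05 level is illustrative):
| extend significant = p_val < 0.05 // true when the one-sided 'greater' test rejects the assumed success_prob of 0.2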
9 - comb_fl()
Calculate C(n, k)
The function comb_fl() is a user-defined function (UDF) that calculates C(n, k), the number of combinations for a selection of k items out of n, without order. It's based on the native gamma() function to calculate factorial. For more information, see factorial_fl(). For a selection of k items with order, use perm_fl().
Syntax
comb_fl(n, k)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| n | int | ✔️ | The total number of items. |
| k | int | ✔️ | The number of items to select out of n, without order. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let comb_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of combinations for selection of k items out of n items without order")
comb_fl(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let comb_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
};
range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)
Stored
range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)
Output
| n | k | cnk |
|---|---|---|
| 3 | 1 | 3 |
| 6 | 4 | 15 |
| 9 | 7 | 36 |
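Because the calculation goes through gamma(), the result matches the closed form C(n, k) = n! / (k! * (n-k)!) up to floating-point rounding. A minimal check of a single value, assuming the stored comb_fl() function:
print cnk = comb_fl(5, 2), closed_form = tolong(gamma(6) / (gamma(3) * gamma(4))) // both evaluate to 10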
10 - dbscan_dynamic_fl()
The function dbscan_dynamic_fl() is a UDF (user-defined function) that clusters a dataset using the DBSCAN algorithm. This function is similar to dbscan_fl(), except that the features are supplied in a single numeric array column instead of in multiple scalar columns.
Syntax
T | invoke dbscan_dynamic_fl(features_col, cluster_col, epsilon, min_samples, metric, metric_params)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
| epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
| min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
| metric | string | | The metric to use when calculating distance between points. |
| metric_params | dynamic | | Extra keyword arguments for the metric function. |
- For a detailed description of the parameters, see the DBSCAN documentation.
- For the list of metrics, see distance computations.
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering of features passed as a single column containing numerical array")
dbscan_dynamic_fl(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

11 - dbscan_fl()
The function dbscan_fl() is a UDF (user-defined function) that clusters a dataset using the DBSCAN algorithm.
Syntax
T | invoke dbscan_fl(features, cluster_col, epsilon, min_samples, metric, metric_params)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
| epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
| min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
| metric | string | | The metric to use when calculating distance between points. |
| metric_params | dynamic | | Extra keyword arguments for the metric function. |
- For a detailed description of the parameters, see the DBSCAN documentation.
- For the list of metrics, see distance computations.
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering")
dbscan_fl(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)

12 - detect_anomalous_new_entity_fl()
Detect the appearance of anomalous new entities in timestamped data.
The function detect_anomalous_new_entity_fl() is a UDF (user-defined function) that detects the appearance of anomalous new entities - such as IP addresses or users - in timestamped data, such as traffic logs. In a cybersecurity context, such events might be suspicious and indicate a potential attack or compromise.
The anomaly model is based on a Poisson distribution representing the number of new entities appearing per time bin (such as a day) for each scope. The Poisson distribution parameter is estimated from the rate at which new entities appeared during the training period, with an added decay factor reflecting that recent appearances matter more than older ones. From this, the function calculates the probability of encountering a new entity during the defined detection period per scope, such as a subscription or an account. The model output is controlled by several optional parameters, such as a minimal anomaly threshold and the decay rate.
The model's direct output is an anomaly score, calculated as the complement of the estimated probability of encountering a new entity. The anomaly score ranges from 0 to 1, where 1 indicates a highly anomalous entity. In addition to the anomaly score, there's a binary flag for detected anomalies (controlled by a minimal threshold parameter) and other explanatory fields.
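The following sketch illustrates the scoring arithmetic on made-up numbers; the same expressions are applied inside the function per scope over the training data. A group of entities first seen a given number of days before the detection period contributes its count, multiplied by decayParam raised to that number of days, to the decayed rate, and the anomaly score is the Poisson probability of seeing no new entity:
print countAddedEntities = 4, diffInDays = 50, maxDiffInDays = 60, decayParam = 0.95 // illustrative values; maxDiffInDays stands for the longest diffInDays in the scope
| extend decayingValue = countAddedEntities * pow(decayParam, diffInDays) // decayed contribution of these entities
| extend newEntityProbability = round(1 - exp(-1.0 * decayingValue / maxDiffInDays), 4) // probability of seeing at least one new entity
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4) // P(X=0) = exp(-rate); a high score means a new entity is unexpected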
Syntax
detect_anomalous_new_entity_fl(entityColumnName, scopeColumnName, timeColumnName, startTraining, startDetection, endDetection, [maxEntitiesThresh], [minTrainingDaysThresh], [decayParam], [anomalyScoreThresh])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| entityColumnName | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which anomaly model is calculated. |
| scopeColumnName | string | ✔️ | The name of the input table column containing the partition or scope, so that a different anomaly model is built for each scope. |
| timeColumnName | string | ✔️ | The name of the input table column containing the timestamps, that are used to define the training and detection periods. |
| startTraining | datetime | ✔️ | The beginning of the training period for the anomaly model. Its end is defined by the beginning of detection period. |
| startDetection | datetime | ✔️ | The beginning of the detection period for anomaly detection. |
| endDetection | datetime | ✔️ | The end of the detection period for anomaly detection. |
| maxEntitiesThresh | int | | The maximum number of existing entities in the scope for which anomalies are calculated. If the number of entities is above the threshold, the scope is considered too noisy and anomalies aren't calculated. The default value is 60. |
| minTrainingDaysThresh | int | | The minimum number of days in the training period that a scope must exist for anomalies to be calculated. If it's below the threshold, the scope is considered too new and unknown, so anomalies aren't calculated. The default value is 14. |
| decayParam | real | | The decay rate parameter for the anomaly model, a number in range (0,1]. Lower values mean faster decay, so more importance is given to later appearances in the training period. A value of 1 means no decay, so a simple average is used for the Poisson distribution parameter estimation. The default value is 0.95. |
| anomalyScoreThresh | real | | The minimum value of the anomaly score for which an anomaly is detected, a number in range [0, 1]. Higher values mean that only more significant cases are considered anomalous, so fewer anomalies are detected (higher precision, lower recall). The default value is 0.9. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSetOnScope = firstSeenSet, firstSeenEntityOnScope = firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntityOnScope)
// adding exponentially decaying weights to counts
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntityOnScope), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSetOnScope
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| join kind = inner (entityData | where firstSeenSet == 'detectSet') on scope, entity, $left.sliceTime == $right.firstSeenEntity
| project-away scope1, scope2, entity1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (docstring = "Detect new and anomalous entity (such as username or IP) per scope (such as subscription or account)", skipvalidation = "true", folder = 'KCL')
detect_anomalous_new_entity_fl(T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSetOnScope = firstSeenSet, firstSeenEntityOnScope = firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntityOnScope)
// adding exponentially decaying weights to counts of
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntityOnScope), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSetOnScope
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| join kind = inner (entityData | where firstSeenSet == 'detectSet') on scope, entity, $left.sliceTime == $right.firstSeenEntity
| project-away scope1, scope2, entity1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSetOnScope = firstSeenSet, firstSeenEntityOnScope = firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntityOnScope)
// adding exponentially decaying weights to counts
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntityOnScope), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSetOnScope
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| join kind = inner (entityData | where firstSeenSet == 'detectSet') on scope, entity, $left.sliceTime == $right.firstSeenEntity
| project-away scope1, scope2, entity1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
};
// synthetic data generation
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'userName' //principalName for positive, deviceId for negative
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Stored
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'userName'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Output
| scope | entity | sliceTime | t | timeSlice | countEvents | userName | deviceId | accountName | dataSet | firstSeenSetOnScope | newEntityProbability | countKnownEntities | lastNewEntityTimestamp | slicesOnScope | newEntityAnomalyScore | isAnomalousNewEntity | anomalyType | anomalyExplainability | anomalyState |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| prodEnvironment | H4ck3r | 2022-04-30 05:00:00.0000000 | 1440 | 2022-04-30 05:00:00.0000000 | 1687 | H4ck3r | abcdefghijklmnoprtuvwxyz012345678 | prodEnvironment | detectSet | trainSet | 0.0031 | 4 | 2022-03-01 09:00:00.0000000 | 60 | 0.9969 | 1 | newEntity_userName | The userName H4ck3r wasn’t seen on accountName prodEnvironment during the last 60 days. Previously, four entities were seen, the last one of them appearing at 2022-03-01 09:00. | [“IT-support : 2022-03-01 07:00”, “Admin : 2022-03-01 08:00”, “Dev2 : 2022-03-01 09:00”, “Dev1 : 2022-03-01 14:00”] |
The output of running the function is the first-seen row in the test dataset for each entity per scope, filtered for new entities (meaning they didn't appear during the training period) that were tagged as anomalous (meaning that the entity's anomaly score was above anomalyScoreThresh). Some other fields are added for clarity:
dataSet: current dataset (always detectSet).
firstSeenSetOnScope: dataset in which the scope was first seen (should be 'trainSet').
newEntityProbability: probability of seeing any new entity, based on the Poisson model estimation.
countKnownEntities: number of existing entities on the scope.
lastNewEntityTimestamp: last time a new entity was seen before the anomalous one.
slicesOnScope: count of slices per scope.
newEntityAnomalyScore: anomaly score of the new entity in the range [0, 1]; higher values mean more anomalous.
isAnomalousNewEntity: binary flag for anomalous new entities.
anomalyType: type of the anomaly (helpful when running several anomaly detection logics together).
anomalyExplainability: textual wrapper for the generated anomaly and its explanation.
anomalyState: bag of existing entities on the scope with their first-seen times.
Running this function on user per account with default parameters detects a previously unseen and anomalous user ('H4ck3r') with a high anomaly score of 0.9969, meaning that its appearance is unexpected (because of the small number of existing users in the training period).
When we run the function with default parameters on deviceId as the entity, we won't see an anomaly, because of the large number of existing devices, which makes the appearance of a new one expected. However, if we lower the parameter anomalyScoreThresh to 0.0001 and raise the parameter maxEntitiesThresh to 10000, we'll effectively decrease precision in favor of recall, and detect an anomaly (with a low anomaly score) on device 'abcdefghijklmnoprtuvwxyz012345678'.
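As a sketch, such a call could reuse testData from the example above and pass the two optional parameters by name, as in the earlier examples (the parameter names maxEntitiesThresh and anomalyScoreThresh are the ones discussed here, and the values are only illustrative):
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'deviceId'
    , scopeColumnName = 'accountName'
    , timeColumnName = 'timeSlice'
    , startTraining = trainPeriodStart
    , startDetection = detectPeriodStart
    , endDetection = detectPeriodStart
    , maxEntitiesThresh = 10000
    , anomalyScoreThresh = 0.0001
    )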
The output shows the anomalous entities together with explanation fields in a standardized format. These fields are useful for investigating the anomaly, and for running anomalous new entity detection on several entity types or together with other algorithms.
The suggested usage in a cybersecurity context is to run the function on meaningful entities, such as usernames or IP addresses, per meaningful scopes, such as subscriptions or accounts. A detected anomalous new entity means that its appearance isn't expected on the scope, and might be suspicious.
13 - factorial_fl()
Calculate factorial.
The function factorial_fl() is a UDF (user-defined function) that calculates the factorial of a positive integer (n!). It's a simple wrapper of the native gamma() function, using the identity n! = gamma(n+1); for example, factorial_fl(5) = gamma(6) = 120. Because gamma() is evaluated in floating point, results for larger values of n can show small rounding artifacts.
Syntax
factorial_fl(n)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| n | int | ✔️ | The input integer for which to calculate the factorial. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let factorial_fl=(n:int)
{
gamma(n+1)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate factorial")
factorial_fl(n:int)
{
gamma(n+1)
}
Example
Query-defined
let factorial_fl=(n:int)
{
gamma(n+1)
};
range x from 1 to 10 step 3
| extend fx = factorial_fl(x)
Stored
range x from 1 to 10 step 3
| extend fx = factorial_fl(x)
Output
| x | fx |
|---|---|
| 1 | 1 |
| 4 | 24 |
| 7 | 5040 |
| 10 | 3628799 |
14 - Functions
Functions are reusable queries or query parts. Kusto supports two kinds of functions:
Built-in functions are hard-coded functions defined by Kusto that can’t be modified by users.
User-defined functions, which are divided into two types:
Stored functions: user-defined functions that are stored and managed as database schema entities, similar to tables. For more information, see Stored functions. To create a stored function, use the .create function command.
Query-defined functions: user-defined functions that are defined and used within the scope of a single query. The definition of such functions is done through a let statement. For more information on how to create query-defined functions, see Create a user defined function.
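For illustration, a minimal query-defined function might look like the following sketch (the function name and values are placeholders, not taken from any product sample):
// A scalar query-defined function created with a let statement, then used in the same query.
let add_one = (x: long) { x + 1 };
print result = add_one(41)
The same body could instead be persisted with the .create function command to make it a stored function.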
For more information on user-defined functions, see User-defined functions.
15 - Functions library
This article contains a categorized list of user-defined functions (UDFs).
The code of each user-defined function is given in its article. It can be used within a let statement embedded in a query, or it can be persisted in a database using the .create function command.
Cybersecurity functions
| Function Name | Description |
|---|---|
| detect_anomalous_access_cf_fl() | Detect anomalous access using collaborative filtering over timestamped data. |
| detect_anomalous_new_entity_fl() | Detect the appearance of anomalous new entities in timestamped data. |
| detect_anomalous_spike_fl() | Detect the appearance of anomalous spikes in numeric variables in timestamped data. |
| graph_blast_radius_fl() | Calculate the Blast Radius (list and score) of source nodes over path or edge data. |
| graph_exposure_perimeter_fl() | Calculate the Exposure Perimeter (list and score) of target nodes over path or edge data. |
| graph_node_centrality_fl() | Calculate various metrics of node centrality (such as degree and betweenness) over graph data (edge and nodes). |
| graph_path_discovery_fl() | Discover valid paths between relevant endpoints (sources and targets) over graph data (edge and nodes). |
General functions
| Function Name | Description |
|---|---|
| geoip_fl() | Retrieves geographic information of an IP address. |
| get_packages_version_fl() | Returns version information of the Python engine and the specified packages. |
Machine learning & AI functions
| Function Name | Description |
|---|---|
| dbscan_fl() | Clusterize using the DBSCAN algorithm, features are in separate columns. |
| dbscan_dynamic_fl() | Clusterize using the DBSCAN algorithm, features are in a single dynamic column. |
| kmeans_fl() | Clusterize using the K-Means algorithm, features are in separate columns. |
| kmeans_dynamic_fl() | Clusterize using the K-Means algorithm, features are in a single dynamic column. |
| predict_fl() | Predict using an existing trained machine learning model. |
| predict_onnx_fl() | Predict using an existing trained machine learning model in ONNX format. |
| slm_embeddings_fl() | Generate text embeddings using local Small Language Models (SLM). |
Plotly functions
The following section contains functions for rendering interactive Plotly charts.
| Function Name | Description |
|---|---|
| plotly_anomaly_fl() | Render anomaly chart using a Plotly template. |
| plotly_gauge_fl() | Render gauge chart using a Plotly template. |
| plotly_graph_fl() | Render interactive graph visualizations using a Plotly template. |
| plotly_scatter3d_fl() | Render 3D scatter chart using a Plotly template. |
PromQL functions
The following section contains common PromQL functions. These functions can be used for analysis of metrics ingested to your database by the Prometheus monitoring system. All functions assume that metrics in your database are structured using the Prometheus data model.
| Function Name | Description |
|---|---|
| series_metric_fl() | Select and retrieve time series stored with the Prometheus data model. |
| series_rate_fl() | Calculate the average rate of counter metric increase per second. |
Series processing functions
| Function Name | Description |
|---|---|
| quantize_fl() | Quantize metric columns. |
| series_clean_anomalies_fl() | Replace anomalies in a series by interpolated value. |
| series_cosine_similarity_fl() | Calculate the cosine similarity of two numerical vectors. |
| series_dbl_exp_smoothing_fl() | Apply a double exponential smoothing filter on series. |
| series_dot_product_fl() | Calculate the dot product of two numerical vectors. |
| series_downsample_fl() | Downsample time series by an integer factor. |
| series_exp_smoothing_fl() | Apply a basic exponential smoothing filter on series. |
| series_fit_lowess_fl() | Fit a local polynomial to series using LOWESS method. |
| series_fit_poly_fl() | Fit a polynomial to series using regression analysis. |
| series_fbprophet_forecast_fl() | Forecast time series values using the Prophet algorithm. |
| series_lag_fl() | Apply a lag filter on series. |
| series_monthly_decompose_anomalies_fl() | Detect anomalies in a series with monthly seasonality. |
| series_moving_avg_fl() | Apply a moving average filter on series. |
| series_moving_var_fl() | Apply a moving variance filter on series. |
| series_mv_ee_anomalies_fl() | Multivariate Anomaly Detection for series using elliptical envelope model. |
| series_mv_if_anomalies_fl() | Multivariate Anomaly Detection for series using isolation forest model. |
| series_mv_oc_anomalies_fl() | Multivariate Anomaly Detection for series using one class SVM model. |
| series_rolling_fl() | Apply a rolling aggregation function on series. |
| series_shapes_fl() | Detects positive/negative trend or jump in series. |
| series_uv_anomalies_fl() | Detect anomalies in time series using the Univariate Anomaly Detection Cognitive Service API. |
| series_uv_change_points_fl() | Detect change points in time series using the Univariate Anomaly Detection Cognitive Service API. |
| time_weighted_avg_fl() | Calculates the time weighted average of a metric using fill forward interpolation. |
| time_weighted_avg2_fl() | Calculates the time weighted average of a metric using linear interpolation. |
| time_weighted_val_fl() | Calculates the time weighted value of a metric using linear interpolation. |
| time_window_rolling_avg_fl() | Calculates the rolling average of a metric over a constant duration time window. |
Statistical and probability functions
| Function Name | Description |
|---|---|
| bartlett_test_fl() | Perform the Bartlett test. |
| binomial_test_fl() | Perform the binomial test. |
| comb_fl() | Calculate C(n, k), the number of combinations for selection of k items out of n. |
| entropy_fl() | Calculate Shannon Entropy of multiple probability vectors. |
| factorial_fl() | Calculate n!, the factorial of n. |
| ks_test_fl() | Perform a Kolmogorov Smirnov test. |
| levene_test_fl() | Perform a Levene test. |
| normality_test_fl() | Performs the Normality Test. |
| mann_whitney_u_test_fl() | Perform a Mann-Whitney U Test. |
| pair_probabilities_fl() | Calculate various probabilities and related metrics for a pair of categorical variables. |
| pairwise_dist_fl() | Calculate pairwise distances between entities based on multiple nominal and numerical variables. |
| percentiles_linear_fl() | Calculate percentiles using linear interpolation between closest ranks. |
| perm_fl() | Calculate P(n, k), the number of permutations for selection of k items out of n. |
| two_sample_t_test_fl() | Perform the two sample t-test. |
| wilcoxon_test_fl() | Perform the Wilcoxon Test. |
Text analytics
| Function Name | Description |
|---|---|
| log_reduce_fl() | Find common patterns in textual logs and output a summary table. |
| log_reduce_full_fl() | Find common patterns in textual logs and output a full table. |
| log_reduce_predict_fl() | Apply a trained model to find common patterns in textual logs and output a summary table. |
| log_reduce_predict_full_fl() | Apply a trained model to find common patterns in textual logs and output a full table. |
| log_reduce_train_fl() | Find common patterns in textual logs and output a model. |
| tokenize_fl() | Tokenize semi-structured text strings into separate columns. |
16 - geoip_fl()
geoip_fl() is a user-defined function that retrieves geographic information of an IP address.
Syntax
T | invoke geoip_fl(ip_col, country_col, state_col, city_col, longitude_col, latitude_col)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| ip_col | string | ✔️ | The name of the column containing the IP addresses to resolve. |
| country_col | string | ✔️ | The name of the column to store the retrieved country. |
| state_col | string | ✔️ | The name of the column to store the retrieved state. |
| city_col | string | ✔️ | The name of the column to store the retrieved city. |
| longitude_col | string | ✔️ | The name of the column to store the retrieved longitude. |
| latitude_col | string | ✔️ | The name of the column to store the retrieved latitude. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Utils', docstring = 'Retrieve geographics of ip address')
geoip_fl(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
};
datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')
Stored
datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')
Output
| ip | country | state | city | longitude | latitude |
|---|---|---|---|---|---|
| 20.103.85.33 | Netherlands | North Holland | Amsterdam | 4.8883 | 52.3716 |
| 20.53.203.50 | Australia | New South Wales | Sydney | 151.2006 | -33.8715 |
| 20.81.111.85 | United States | Virginia | Tappahannock | -76.8545 | 37.9273 |
| 20.84.181.62 | United States | Iowa | Des Moines | -93.6124 | 41.6021 |
| 205.251.242.103 | United States | Virginia | Ashburn | -77.4903 | 39.0469 |
| 8.8.8.8 | United States | California | Los Angeles | -118.2441 | 34.0544 |
17 - get_packages_version_fl()
get_packages_version_fl() is a user-defined function that retrieves the versions of the Python engine and packages of the inline python() plugin.
The function accepts a dynamic array containing the names of the packages to check, and returns their respective versions and the Python engine version.
Syntax
T | invoke get_packages_version_fl(packages)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| packages | dynamic | | A dynamic array containing the names of the packages. The default is an empty list, which retrieves only the Python engine version. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Utils", docstring = "Returns version information of the Python engine and the specified packages")
get_packages_version_fl(packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
};
get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))
Stored
get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))
Output
| name | ver |
|---|---|
| numpy | 1.23.4 |
| onnxruntime | 1.13.1 |
| pandas | 1.5.1 |
| plotly | 5.11.0 |
| Python | 3.10.8 (tags/v3.10.8:aaaf517, Oct 11 2022, 16:50:30) [MSC v.1933 64 bit (AMD64)] |
| scipy | 1.9.3 |
| sklearn | 1.1.3 |
| statsmodels | 0.13.2 |
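Because packages defaults to an empty list, the stored function can also be called with no arguments to return only the Python engine version, as a sketch:
get_packages_version_fl()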
18 - kmeans_dynamic_fl()
The function kmeans_dynamic_fl() is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm. This function is similar to kmeans_fl(), except that the features are supplied in a single numerical array column rather than in multiple scalar columns.
Syntax
T | invoke kmeans_dynamic_fl(k, features_col, cluster_col)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| k | int | ✔️ | The number of clusters. |
| features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "K-Means clustering of features passed as a single column containing numerical array")
kmeans_dynamic_fl(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

19 - kmeans_fl()
The function kmeans_fl() is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm.
Syntax
T | invoke kmeans_fl(k, features, cluster_col)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| k | int | ✔️ | The number of clusters. |
| features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "K-Means clustering")
kmeans_fl(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clusterize artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)

20 - ks_test_fl()
The function ks_test_fl() is a UDF (user-defined function) that performs the Kolmogorov Smirnov Test.
Syntax
T | invoke ks_test_fl(data1, data2, test_statistic, p_value)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Kolmogorov Smirnov Test")
ks_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 0.66666666666666674 | 0.3197243332709643 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1 | 0.03262165165202116 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 1 | 0.01106563701580386 |
21 - levene_test_fl()
The function levene_test_fl() is a UDF (user-defined function) that performs the Levene Test.
Syntax
T | invoke levene_test_fl(data1, data2, test_statistic, p_value)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Levene Test")
levene_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.5587395987367387 | 0.27993504690044563 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.6402495788130482 | 0.26950872948841353 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0032989690721642395 | 0.95606240301049072 |
22 - log_reduce_fl()
The function log_reduce_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. It outputs a summary table containing the found patterns sorted top down by their respective frequency.
Syntax
T | invoke log_reduce_fl(reduce_col [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]])
Parameters
The following parameter descriptions are a summary. For more information, see the More about the algorithm section.
| Name | Type | Required | Description |
|---|---|---|---|
| reduce_col | string | ✔️ | The name of the string column the function is applied to. |
| use_logram | bool | | Enable or disable the Logram algorithm. Default value is true. |
| use_drain | bool | | Enable or disable the Drain algorithm. Default value is true. |
| custom_regexes | dynamic | | A dynamic array containing pairs of regular expressions and replacement symbols, to be searched for in each input row and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IP addresses, and GUIDs. |
| custom_regexes_policy | string | | Either 'prepend', 'append' or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'. |
| delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single-character delimiter. |
| similarity_th | real | | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, this parameter has no effect. |
| tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, this parameter has no effect. |
| trigram_th | int | | Decreasing trigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 10. If Logram is disabled, this parameter has no effect. |
| bigram_th | int | | Decreasing bigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 15. If Logram is disabled, this parameter has no effect. |
More about the algorithm
The function runs multiple passes over the rows to reduce them to common patterns. The following list explains the passes:
Regular expression replacements: In this pass, each line is independently matched against a set of regular expressions, and each matched expression is replaced by a replacement symbol. The default regular expressions replace IP addresses, GUIDs, and numbers with <IP>, <GUID>, and <NUM>, respectively. You can prepend or append more regular expressions to the defaults, or replace them with new ones or with an empty list, by modifying custom_regexes and custom_regexes_policy. For example, to replace whole numbers with <WNUM> set custom_regexes=pack_array('/^\d+$/', '<WNUM>'); to cancel the regular expression replacement set custom_regexes_policy='replace' (see the sketch after this section). For each line, the function keeps the list of the original expressions (before replacement) to be output as parameters of the generic replacement tokens.
Tokenization: Similar to the previous pass, each line is processed independently and broken into tokens based on a set of delimiters. For example, to break lines into tokens by comma, period, or semicolon, set delimiters=pack_array(',', '.', ';').
Apply the Logram algorithm: This pass is optional, and runs only if use_logram is true. We recommend using Logram when large scale is required, and when parameters can appear in the first tokens of the log entry. Conversely, disable it when the log entries are short, as in such cases the algorithm tends to replace tokens with wildcards too often. The Logram algorithm considers 3-tuples and 2-tuples of tokens. If a 3-tuple of tokens is common in the log lines (it appears more than trigram_th times), then it's likely that all three tokens are part of the pattern. If the 3-tuple is rare, then it likely contains a variable that should be replaced by a wildcard. For rare 3-tuples, we consider the frequency with which the 2-tuples contained in the 3-tuple appear. If a 2-tuple is common (it appears more than bigram_th times), then the remaining token is likely to be a parameter, and not part of the pattern.
The Logram algorithm is easy to parallelize. It requires two passes over the log corpus: the first to count the frequency of each 3-tuple and 2-tuple, and the second to apply the logic described previously to each entry. To parallelize the algorithm, we only need to partition the log entries and unify the frequency counts of the different workers.
Apply the Drain algorithm: This pass is optional, and runs only if use_drain is true. Drain is a log parsing algorithm based on a truncated-depth prefix tree. Log messages are split according to their length, and for each length the first tree_depth tokens of the log message are used to build a prefix tree. If no match for the prefix tokens is found, a new branch is created. If a match for the prefix is found, we search for the most similar pattern among the patterns contained in the tree leaf. Pattern similarity is measured by the ratio of matched nonwildcard tokens out of all tokens. If the similarity of the most similar pattern is above the similarity threshold (the parameter similarity_th), the log entry is matched to that pattern, and the function replaces all nonmatching tokens in the pattern with wildcards. If the similarity of the most similar pattern is below the similarity threshold, a new pattern containing the log entry is created.
We set the default tree_depth to 4 based on testing various logs. Increasing this depth can improve runtime but might degrade pattern accuracy; decreasing it makes parsing more accurate but slower, as each node performs many more similarity tests.
Usually, Drain efficiently generalizes and reduces patterns (though it's hard to parallelize). However, because it relies on a prefix tree, it might not be optimal for log entries that contain parameters in their first tokens. In most cases, this can be resolved by applying Logram first.
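As a sketch of the custom_regexes option described above, the following call appends a regular expression that replaces whole-number tokens with <WNUM>. It assumes the stored version of the function and the HDFS_log table used in the example below; the verbatim string literal and the exact expression are illustrative only:
HDFS_log
| take 1000
| invoke log_reduce_fl(reduce_col="data",
      custom_regexes=pack_array(@'^\d+$', '<WNUM>'),   // replace tokens that are whole numbers
      custom_regexes_policy='append')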
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_fl=(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a summary table')
log_reduce_fl(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}
Example
The following example uses the invoke operator to run the function. This example uses Apache Hadoop distributed file system logs.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_fl=(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")
Stored
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")
Output
| Count | LogReduce | Example |
|---|---|---|
| 55356 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_<NUM> is added to invalidSet of <IP> 081110 220623 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_1239016582509138045 is added to invalidSet of 10.251.123.195:50010 |
| 10278 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864 |
| 10256 | 081110 | <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating |
| 10256 | 081110 | <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 |
| 9140 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 |
| 3047 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/temporary/task<NUM><NUM>m<NUM>_<NUM>/part-<NUM>. <> 081110 215858 26 INFO dfs.FSNamesystem: BLOCK NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022 |
| 1402 | 081110 | <NUM> <NUM> INFO <>: <> block blk_<NUM> <> <> 081110 215957 15556 INFO dfs.DataNode$DataTransfer: 10.250.15.198:50010:Transmitted block blk_-3782569120714539446 to /10.251.203.129:50010 |
| 177 | 081110 | <NUM> <NUM> INFO <>: <> <> <> <*> 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474 |
| 36 | 081110 | <NUM> <NUM> INFO <>: <> <> <> for block <*> 081110 215924 15636 INFO dfs.DataNode$BlockReceiver: Receiving empty packet for block blk_3991288654265301939 |
| 12 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* <> <> <> <> <> <> <> <> 081110 215953 19 INFO dfs.FSNamesystem: BLOCK* ask 10.250.15.198:50010 to replicate blk_-3782569120714539446 to datanode(s) 10.251.203.129:50010 |
| 12 | 081110 | <NUM> <NUM> INFO <>: <> <> <> <> <> block blk_<NUM> <> <> 081110 215955 18 INFO dfs.DataNode: 10.250.15.198:50010 Starting thread to transfer block blk_-3782569120714539446 to 10.251.203.129:50010 |
| 12 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Received block blk_<NUM> src: <IP> dest: <IP> of size <NUM> 081110 215957 15226 INFO dfs.DataNode$DataXceiver: Received block blk_-3782569120714539446 src: /10.250.15.198:51013 dest: /10.250.15.198:50010 of size 14474705 |
| 6 | 081110 | <NUM> <NUM> <> dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: <> <> <> <> <> <> <> <> size <NUM> 081110 215924 27 WARN dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: Redundant addStoredBlock request received for blk_2522553781740514003 on 10.251.202.134:50010 size 67108864 |
| 6 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: <> <> <> <> <>: <> <> <> <> <> 081110 215936 15714 INFO dfs.DataNode$DataXceiver: writeBlock blk_720939897861061328 received exception java.io.IOException: Couldn’t read from stream |
| 3 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: <> <> <> <> <> <> <> size <NUM> <> <> <> <> <> <> <> <>. 081110 220635 28 INFO dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: addStoredBlock request received for blk_-81196479666306310 on 10.250.17.177:50010 size 53457811 But it doesn’t belong to any file. |
| 1 | 081110 | <NUM> <NUM> <> <>: <> <> <> <> <> <> <>. <> <> <> <> <>. 081110 220631 19 WARN dfs.FSDataset: Unexpected error trying to delete block blk_-2012154052725261337. BlockInfo not found in volumeMap. |
23 - log_reduce_full_fl()
The function log_reduce_full_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function's algorithm and most of its parameters are identical to log_reduce_fl(). However, log_reduce_fl() outputs a patterns summary table, whereas this function outputs a full table containing the pattern and parameters for each line.
Syntax
T | invoke log_reduce_full_fl(reduce_col [, pattern_col [, parameters_col [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]]]])
Parameters
The following parameter descriptions are a summary. For more information, see the More about the algorithm section.
| Name | Type | Required | Description |
|---|---|---|---|
| reduce_col | string | ✔️ | The name of the string column the function is applied to. |
| pattern_col | string | ✔️ | The name of the string column to populate the pattern. |
| parameters_col | string | ✔️ | The name of the string column to populate the pattern’s parameters. |
| use_logram | bool | | Enable or disable the Logram algorithm. Default value is true. |
| use_drain | bool | | Enable or disable the Drain algorithm. Default value is true. |
| custom_regexes | dynamic | | A dynamic array containing pairs of regular expressions and replacement symbols, to be searched for in each input row and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IP addresses, and GUIDs. |
| custom_regexes_policy | string | | Either 'prepend', 'append' or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'. |
| delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single-character delimiter. |
| similarity_th | real | | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, this parameter has no effect. |
| tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, this parameter has no effect. |
| trigram_th | int | | Decreasing trigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 10. If Logram is disabled, this parameter has no effect. |
| bigram_th | int | | Decreasing bigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 15. If Logram is disabled, this parameter has no effect. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a full table')
log_reduce_full_fl(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10
Stored
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10
Output
| data | Patterns | Parameters |
|---|---|---|
| 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15485", "parameter_2": "5080254298708411681", "parameter_3": "67108864", "parameter_4": "/10.251.43.21"} |
| 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15494", "parameter_2": "-7037346755429293022", "parameter_3": "/10.251.43.21:45933", "parameter_4": "/10.251.43.21:50010"} |
| 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "2", "parameter_3": "-7746692545918257727"} |
| 081110 215858 15496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "-7746692545918257727", "parameter_3": "67108864", "parameter_4": "/10.251.107.227"} |
| 081110 215858 15511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15511", "parameter_2": "-8578644687709935034", "parameter_3": "/10.251.107.227:39600", "parameter_4": "/10.251.107.227:50010"} |
| 081110 215858 15514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15514", "parameter_2": "722881101738646364", "parameter_3": "/10.251.75.79:58213", "parameter_4": "/10.251.75.79:50010"} |
| 081110 215858 15517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "2", "parameter_3": "-7110736255599716271"} |
| 081110 215858 15517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "-7110736255599716271", "parameter_3": "67108864", "parameter_4": "/10.251.42.246"} |
| 081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "7257432994295824826", "parameter_3": "/10.251.26.8:41803", "parameter_4": "/10.251.26.8:50010"} |
| 081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "-7771332301119265281", "parameter_3": "/10.251.43.210:34258", "parameter_4": "/10.251.43.210:50010"} |
24 - log_reduce_predict_fl()
The function log_reduce_predict_fl() parses semi-structured textual columns, such as log lines, and for each line matches the respective pattern from a pretrained model, or reports an anomaly if no matching pattern was found. The function's output is similar to log_reduce_fl(), though the patterns are retrieved from a pretrained model that was generated by log_reduce_train_fl().
Syntax
T | invoke log_reduce_predict_fl(models_tbl, model_name, reduce_col [, anomaly_str ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| models_tbl | table | ✔️ | A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string). |
| model_name | string | ✔️ | The name of the model that will be retrieved from models_tbl. If the table contains several models matching the model name, the latest one is used (see the sketch after this table). |
| reduce_col | string | ✔️ | The name of the string column the function is applied to. |
| anomaly_str | string | | This string is output for lines that have no matched pattern in the model. Default value is "ANOMALY". |
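When several model versions share the same model_name, only the most recent one is applied. To check which version would be picked, you can query the models table directly. The following is a minimal sketch, assuming a models table named ML_Models with the schema above (as in the examples below) and the model name HDFS_100K.
ML_Models
| where name == "HDFS_100K"
| top 1 by timestamp desc
| project name, timestamp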
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a summary table')
log_reduce_predict_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")
Stored
HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")
Output
| Count | LogReduce | example |
|---|---|---|
| 239 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 |
| 231 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 |
| 230 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating |
| 218 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> | 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864 |
| 79 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: <>. <> | 081110 215858 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022 |
| 3 | 081110 <NUM> <NUM> INFO dfs.DataBlockScanner: Verification succeeded for <*> | 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474 |
25 - log_reduce_predict_full_fl()
The function log_reduce_predict_full_fl() parses semi-structured textual columns, such as log lines, and for each line matches the respective pattern from a pretrained model, or reports an anomaly if no matching pattern was found. The patterns are retrieved from a pretrained model generated by log_reduce_train_fl(). The function is similar to log_reduce_predict_fl(), but whereas log_reduce_predict_fl() outputs a patterns summary table, this function outputs a full table containing the pattern and parameters for each line.
Syntax
T | invoke log_reduce_predict_full_fl(models_tbl, model_name, reduce_col, pattern_col, parameters_col [, anomaly_str ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| models_tbl | table | ✔️ | A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string). |
| model_name | string | ✔️ | The name of the model that will be retrieved from models_tbl. If the table contains several models matching the model name, the latest one is used. |
| reduce_col | string | ✔️ | The name of the string column the function is applied to. |
| pattern_col | string | ✔️ | The name of the string column to populate the pattern. |
| parameters_col | string | ✔️ | The name of the string column to populate the pattern’s parameters. |
| anomaly_str | string | | This string is output for lines that have no matched pattern in the model. Default value is "ANOMALY". |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a full table')
log_reduce_predict_full_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
Stored
HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
Output
| data | Patterns | Parameters |
|---|---|---|
| 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15485", "parameter_2": "5080254298708411681", "parameter_3": "67108864", "parameter_4": "/10.251.43.21"} |
| 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15494", "parameter_2": "-7037346755429293022", "parameter_3": "/10.251.43.21:45933", "parameter_4": "/10.251.43.21:50010"} |
| 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "2", "parameter_3": "-7746692545918257727"} |
| 081110 215858 15496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "-7746692545918257727", "parameter_3": "67108864", "parameter_4": "/10.251.107.227"} |
| 081110 215858 15511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15511", "parameter_2": "-8578644687709935034", "parameter_3": "/10.251.107.227:39600", "parameter_4": "/10.251.107.227:50010"} |
| 081110 215858 15514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15514", "parameter_2": "722881101738646364", "parameter_3": "/10.251.75.79:58213", "parameter_4": "/10.251.75.79:50010"} |
| 081110 215858 15517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "2", "parameter_3": "-7110736255599716271"} |
| 081110 215858 15517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "-7110736255599716271", "parameter_3": "67108864", "parameter_4": "/10.251.42.246"} |
| 081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "7257432994295824826", "parameter_3": "/10.251.26.8:41803", "parameter_4": "/10.251.26.8:50010"} |
| 081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "-7771332301119265281", "parameter_3": "/10.251.43.210:34258", "parameter_4": "/10.251.43.210:50010"} |
26 - log_reduce_train_fl()
The function log_reduce_train_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function's algorithm and most of the parameters are identical to log_reduce_fl(), but whereas log_reduce_fl() outputs a patterns summary table, this function outputs a serialized model. The model can be used by the functions log_reduce_predict_fl() and log_reduce_predict_full_fl() to predict the matched pattern for new log lines.
Syntax
T | invoke log_reduce_train_fl(reduce_col, model_name [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]])
Parameters
The following parameter descriptions are a summary. For more information, see the More about the algorithm section.
| Name | Type | Required | Description |
|---|---|---|---|
| reduce_col | string | ✔️ | The name of the string column the function is applied to. |
| model_name | string | ✔️ | The name of the output model. |
| use_logram | bool | | Enable or disable the Logram algorithm. Default value is true. |
| use_drain | bool | | Enable or disable the Drain algorithm. Default value is true. |
| custom_regexes | dynamic | | A dynamic array containing pairs of regular expressions and replacement symbols to be searched for in each input row and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IPs, and GUIDs. |
| custom_regexes_policy | string | | Either 'prepend', 'append', or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'. |
| delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single-character delimiter. |
| similarity_th | real | | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters (see the sketch after this table). Default value is 0.5. If Drain is disabled, this parameter has no effect. |
| tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, this parameter has no effect. |
| trigram_th | int | | Decreasing trigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 10. If Logram is disabled, this parameter has no effect. |
| bigram_th | int | | Decreasing bigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 15. If Logram is disabled, this parameter has no effect. |
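These parameters control the clustering granularity. The following call is a minimal sketch, assuming the HDFS_log_100k table used in the example below and the function already defined; it trains a Drain-only model by disabling Logram and raising the similarity threshold to obtain more refined clusters. The model name HDFS_100K_drain is hypothetical.
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K_drain", use_logram=false, similarity_th=0.7)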
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a model')
log_reduce_train_fl(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
};
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")
Stored
//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")
Output
| ExtentId | OriginalSize | ExtentSize | CompressedSize | IndexSize | RowCount |
|---|---|---|---|---|---|
| 3734a525-cc08-44b9-a992-72de97b32414 | 10383 | 11546 | 10834 | 712 | 1 |
27 - mann_whitney_u_test_fl()
The function mann_whitney_u_test_fl() is a UDF (user-defined function) that performs the Mann-Whitney U Test.
Syntax
T | invoke mann_whitney_u_test_fl(data1, data2, test_statistic, p_value [, use_continuity ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
| use_continuity | bool | | Determines if a continuity correction (1/2) is applied. Default is true. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Mann-Whitney U Test")
mann_whitney_u_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1 | 0.095215131912761986 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 0 | 0.04042779918502612 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0 | 0.015191410988288745 |
28 - normality_test_fl()
The function normality_test_fl() is a UDF (user-defined function) that performs the Normality Test.
Syntax
T | invoke normality_test_fl(data, test_statistic, p_value)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data | string | ✔️ | The name of the column containing the data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Normality Test")
normality_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')
Output
| id | sample1 | test_stat | p_val |
|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57] | 7.4881873153941036 | 0.023657060728893706 |
| Test #2 | [20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89] | 3.29982750330276 | 0.19206647332255408 |
| Test #3 | [20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5] | 6.9868433851364324 | 0.030396685911910585 |
29 - pair_probabilities_fl()
Calculate various probabilities and related metrics for a pair of categorical variables.
The function pair_probabilities_fl() is a UDF (user-defined function) that calculates the following probabilities and related metrics for a pair of categorical variables, A and B:
- P(A) is the probability of each value A=a
- P(B) is the probability of each value B=b
- P(A|B) is the conditional probability of A=a given B=b
- P(B|A) is the conditional probability of B=b given A=a
- P(A∪B) is the union probability (A=a or B=b)
- P(A∩B) is the intersection probability (A=a and B=b)
- The lift metric is calculated as P(A∩B)/(P(A)·P(B)). For more information, see lift metric.
- A lift near 1 means that the joint probability of two values is similar to what is expected if the two variables are independent.
- A lift much greater than 1 means that the values co-occur more often than expected under the independence assumption.
- A lift much less than 1 means that the values co-occur less often than expected under the independence assumption.
- The Jaccard similarity coefficient is calculated as P(A∩B)/P(A∪B). For more information, see Jaccard similarity coefficient.
- A high Jaccard coefficient, close to 1, means that the values tend to occur together.
- A low Jaccard coefficient, close to 0, means that the values tend to stay apart.
Syntax
pair_probabilities_fl(A, B, Scope)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| A | scalar | ✔️ | The first categorical variable. |
| B | scalar | ✔️ | The second categorical variable. |
| Scope | scalar | ✔️ | The field that contains the scope, so that the probabilities for A and B are calculated independently for each scope value. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate probabilities and related metrics for a pair of categorical variables")
pair_probabilities_fl (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
//
let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
'James', 'Mary', 'Modern',
'James', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Patricia', 'Modern',
'Robert', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Classic',
'Robert', 'Linda', 'Classic',
'James', 'Mary', 'Classic',
'James', 'Linda', 'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')
Stored
let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
'James', 'Mary', 'Modern',
'James', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Patricia', 'Modern',
'Robert', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Classic',
'Robert', 'Linda', 'Classic',
'James', 'Mary', 'Classic',
'James', 'Linda', 'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')
Output
Let's look at a list of pairs of people dancing in two dance classes, supposedly at random, to find out whether anything looks anomalous (meaning, not random). We'll start by looking at each class by itself.
The Michael-Patricia pair has a lift metric of 2.375, which is significantly above 1. This value means that they're seen together much more often than would be expected if the pairing were random. Their Jaccard coefficient is 0.75, which is close to 1: when either of them dances, they tend to dance with each other.
| A | B | scope | P_A | P_B | P_AB | P_AUB | P_AIB | P_BIA | Lift_AB | Jaccard_AB |
|---|---|---|---|---|---|---|---|---|---|---|
| Robert | Patricia | Modern | 0.31578 | 0.42105 | 0.05263 | 0.68421 | 0.12499 | 0.16666 | 0.39583 | 0.07692 |
| Robert | Mary | Modern | 0.31578 | 0.26315 | 0.15789 | 0.42105 | 0.59999 | 0.49999 | 1.89999 | 0.37499 |
| Robert | Linda | Modern | 0.31578 | 0.31578 | 0.10526 | 0.52631 | 0.33333 | 0.33333 | 1.05555 | 0.2 |
| Michael | Patricia | Modern | 0.31578 | 0.42105 | 0.31578 | 0.42105 | 0.75 | 0.99999 | 2.375 | 0.75 |
| James | Patricia | Modern | 0.36842 | 0.42105 | 0.05263 | 0.73684 | 0.12499 | 0.14285 | 0.33928 | 0.07142 |
| James | Mary | Modern | 0.36842 | 0.26315 | 0.10526 | 0.52631 | 0.4 | 0.28571 | 1.08571 | 0.2 |
| James | Linda | Modern | 0.36842 | 0.31578 | 0.21052 | 0.47368 | 0.66666 | 0.57142 | 1.80952 | 0.44444 |
| Robert | Mary | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
| Robert | Linda | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
| James | Mary | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
| James | Linda | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
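As a quick sanity check, the following sketch re-derives the lift and Jaccard metrics of the Michael-Patricia pair from the probabilities in the table above; the numeric literals are copied from that row.
print P_A=0.31578, P_B=0.42105, P_AB=0.31578
| extend P_AUB = P_A + P_B - P_AB
| extend Lift_AB = P_AB / (P_A * P_B), Jaccard_AB = P_AB / P_AUB
The query returns Lift_AB ≈ 2.375 and Jaccard_AB ≈ 0.75, matching the values in the table.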
30 - pairwise_dist_fl()
Calculate pairwise distances between entities based on multiple nominal and numerical variables.
The function pairwise_dist_fl() is a UDF (user-defined function) that calculates the multivariate distance between data points belonging to the same partition, taking into account nominal and numerical variables.
- All string fields, besides entity and partition names, are considered nominal variables. The distance is equal to 1 if the values are different, and 0 if they’re the same.
- All numerical fields are considered numerical variables. They’re normalized by transforming to z-scores and the distance is calculated as the absolute value of the difference. The total multivariate distance between data points is calculated as the average of the distances between variables.
A distance close to zero means that the entities are similar, while a distance above 1 means they're different. Similarly, an entity whose average distance from the others is close to or above 1 differs from many other entities in the partition, which suggests a potential outlier.
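The following sketch illustrates the per-variable distance rule on a single hypothetical pair of entities, using one nominal variable and one numerical variable that has already been converted to z-scores; the values are made up for illustration.
print accessory1='Hat', accessory2='Bag', height_z1=0.8, height_z2=-0.4
| extend nominal_dist = todouble(accessory1 != accessory2)  // 1.0 because the values differ
| extend numerical_dist = abs(height_z1 - height_z2)        // 1.2, the absolute difference of the z-scores
| extend dist = (nominal_dist + numerical_dist) / 2         // 1.1, the average over the two variables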
Syntax
pairwise_dist_fl(entity, partition)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| entity | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which the distances will be calculated. |
| partition | string | ✔️ | The name of the input table column containing the partition or scope, so that the distances are calculated for all pairs of entities under the same partition. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
    // Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
    // by adding the relevant metrics to the previous function and using them here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate distances between pairs of entities based on multiple nominal and numerical variables")
pairwise_dist_fl (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
    // Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
    // by adding the relevant metrics to the previous function and using them here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
// Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
// by adding metrics to the previous function and using them here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
//
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
'Andy', 'M', 160, 80, 4, 'Hat', 'Person',
'Betsy', 'F', 170, 70, 4, 'Bag', 'Person',
'Cindy', 'F', 130, 30, 4, 'Hat', 'Person',
'Dan', 'M', 190, 105, 4, 'Hat', 'Person',
'Elmie', 'M', 110, 30, 4, 'Toy', 'Person',
'Franny', 'F', 170, 65, 4, 'Bag', 'Person',
'Godzilla', '?', 260, 210, 5, 'Tail', 'Person',
'Hannie', 'F', 112, 28, 4, 'Toy', 'Person',
'Ivie', 'F', 105, 20, 4, 'Toy', 'Person',
'Johnnie', 'M', 107, 21, 4, 'Toy', 'Person',
'Kyle', 'M', 175, 76, 4, 'Hat', 'Person',
'Laura', 'F', 180, 70, 4, 'Bag', 'Person',
'Mary', 'F', 160, 60, 4, 'Bag', 'Person',
'Noah', 'M', 178, 90, 4, 'Hat', 'Person',
'Odelia', 'F', 186, 76, 4, 'Bag', 'Person',
'Paul', 'M', 158, 69, 4, 'Bag', 'Person',
'Qui', 'F', 168, 62, 4, 'Bag', 'Person',
'Ronnie', 'M', 108, 26, 4, 'Toy', 'Person',
'Sonic', 'F', 52, 20, 6, 'Tail', 'Pet',
'Tweety', 'F', 52, 20, 6, 'Tail', 'Pet' ,
'Ulfie', 'M', 39, 29, 4, 'Wings', 'Pet',
'Vinnie', 'F', 53, 22, 1, 'Tail', 'Pet',
'Waldo', 'F', 51, 21, 4, 'Tail', 'Pet',
'Xander', 'M', 50, 24, 4, 'Tail', 'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc
Stored
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
'Andy', 'M', 160, 80, 4, 'Hat', 'Person',
'Betsy', 'F', 170, 70, 4, 'Bag', 'Person',
'Cindy', 'F', 130, 30, 4, 'Hat', 'Person',
'Dan', 'M', 190, 105, 4, 'Hat', 'Person',
'Elmie', 'M', 110, 30, 4, 'Toy', 'Person',
'Franny', 'F', 170, 65, 4, 'Bag', 'Person',
'Godzilla', '?', 260, 210, 5, 'Tail', 'Person',
'Hannie', 'F', 112, 28, 4, 'Toy', 'Person',
'Ivie', 'F', 105, 20, 4, 'Toy', 'Person',
'Johnnie', 'M', 107, 21, 4, 'Toy', 'Person',
'Kyle', 'M', 175, 76, 4, 'Hat', 'Person',
'Laura', 'F', 180, 70, 4, 'Bag', 'Person',
'Mary', 'F', 160, 60, 4, 'Bag', 'Person',
'Noah', 'M', 178, 90, 4, 'Hat', 'Person',
'Odelia', 'F', 186, 76, 4, 'Bag', 'Person',
'Paul', 'M', 158, 69, 4, 'Bag', 'Person',
'Qui', 'F', 168, 62, 4, 'Bag', 'Person',
'Ronnie', 'M', 108, 26, 4, 'Toy', 'Person',
'Sonic', 'F', 52, 20, 6, 'Tail', 'Pet',
'Tweety', 'F', 52, 20, 6, 'Tail', 'Pet' ,
'Ulfie', 'M', 39, 29, 4, 'Wings', 'Pet',
'Vinnie', 'F', 53, 22, 1, 'Tail', 'Pet',
'Waldo', 'F', 51, 21, 4, 'Tail', 'Pet',
'Xander', 'M', 50, 24, 4, 'Tail', 'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc
Output
| entity1 | Andy | Betsy | Cindy | Dan | Elmie | Franny | Godzilla | Hannie | … |
|---|---|---|---|---|---|---|---|---|---|
| Andy | | 0.354 | 0.4125 | 0.1887 | 0.4843 | 0.3702 | 1.2087 | 0.6265 | … |
| Betsy | 0.354 | | 0.416 | 0.4708 | 0.6307 | 0.0161 | 1.2051 | 0.4872 | … |
| Cindy | 0.4125 | 0.416 | | 0.6012 | 0.3575 | 0.3998 | 1.4783 | 0.214 | … |
| Dan | 0.1887 | 0.4708 | 0.6012 | | 0.673 | 0.487 | 1.0199 | 0.8152 | … |
| Elmie | 0.4843 | 0.6307 | 0.3575 | 0.673 | | 0.6145 | 1.5502 | 0.1565 | … |
| Franny | 0.3702 | 0.0161 | 0.3998 | 0.487 | 0.6145 | | 1.2213 | 0.471 | … |
| Godzilla | 1.2087 | 1.2051 | 1.4783 | 1.0199 | 1.5502 | 1.2213 | | 1.5495 | … |
| Hannie | 0.6265 | 0.4872 | 0.214 | 0.8152 | 0.1565 | 0.471 | 1.5495 | | … |
| … | … | … | … | … | … | … | … | … | … |
Looking at entities of two different types, we would like to calculate the distance between entities belonging to the same type, taking into account both nominal variables (such as gender or preferred accessory) and numerical variables (such as the number of limbs, height, and weight). The numerical variables are on different scales, so they must be centered and scaled, which the function does automatically. The output is pairs of entities under the same partition with their calculated multivariate distance. It can be analyzed directly, visualized as a distance matrix or scatterplot, or used as input for an outlier detection algorithm by calculating the mean distance per entity; entities with high values indicate global outliers. For example, when adding an optional visualization using a distance matrix, you get a table as shown in the sample above. From the sample, you can see that:
- Some pairs of entities (Betsy and Franny) have a low distance value (close to 0) indicating they’re similar.
- Some pairs of entities (Godzilla and Elmie) have a high distance value (1 or above) indicating they’re different.
The output can further be used to calculate the average distance per entity. A high average distance might indicate global outliers. For example, we can see that on average Godzilla has a high distance from the others indicating that it’s a probable global outlier.
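The following query, a minimal sketch that reuses the raw_data table and the function definition from the example above, averages the pairwise distances per entity within the 'Person' partition; Godzilla is expected to rank highest:
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person'
| summarize avg_dist = avg(dist) by entity
| top 3 by avg_dist desc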
31 - percentiles_linear_fl()
The function percentiles_linear_fl() is a user-defined function (UDF) that calculates percentiles using linear interpolation between closest ranks, the same method used by Excel's PERCENTILE.INC function. The native Kusto percentile functions use the nearest-rank method. For large sets of values, the difference between the two methods is insignificant, and we recommend using the native functions for best performance. For more information on these and other percentile calculation methods, see the percentile article on Wikipedia.
The function accepts a table containing the column to calculate on, an optional grouping key, and a dynamic array of the required percentiles. It returns a column containing a dynamic array of the percentile values for each group.
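For example, for a group with sorted values [5, 7, 9], the 25th percentile is computed as index = 0.25 × (3 − 1) = 0.5, so the result is interpolated halfway between 5 (rank 0) and 7 (rank 1), giving 6. This matches the pct_val array for group A in the example output below.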
Syntax
T | invoke percentiles_linear_fl(val_col, pct_arr [, aggr_col ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| val_col | string | ✔️ | The name of the column that contains the values with which to calculate the percentiles. |
| pct_arr | dynamic | ✔️ | A numerical array containing the required percentiles. Each percentile should be in the range [0-100]. |
| aggr_col | string | | The name of the column that contains the grouping key. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate linear interpolated percentiles (identical to Excel's PERCENTILE.INC)")
percentiles_linear_fl(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
};
datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals
Stored
datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals
Output
| name | x | pct_arr | pct_val |
|---|---|---|---|
| A | [5,7,9] | [0,25,50,75,100] | [5,6,7,8,9] |
| B | [5,7,7,10] | [0,25,50,75,100] | [5,6.5,7,7.75,10] |
32 - perm_fl()
Calculate P(n, k)
The function perm_fl() is a user-defined function (UDF) that calculates P(n, k), the number of permutations for selection of k items out of n, with order. It's based on the native gamma() function to calculate the factorial (see factorial_fl()). For selection of k items without order, use comb_fl().
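In other words, perm_fl() computes P(n, k) = n! / (n - k)!, implemented as gamma(n+1) / gamma(n-k+1). For example, P(6, 4) = 6! / 2! = 360, as shown in the example output below.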
Syntax
perm_fl(n, k)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| n | int | ✔️ | The total number of items. |
| k | int | ✔️ | The number of selected items. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let perm_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of permutations for selection of k items out of n items with order")
perm_fl(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let perm_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
};
range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)
Stored
range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)
Output
| n | k | pnk |
|---|---|---|
| 3 | 1 | 3 |
| 6 | 4 | 360 |
| 9 | 7 | 181440 |
33 - plotly_anomaly_fl()
The function plotly_anomaly_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive anomaly chart.
The function accepts a table containing the source and the baseline time series, lists of positive and negative anomalies with their respective sizes, and chart labeling strings. The function returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘anomaly’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_anomaly_fl(time_col, val_col, baseline_col, time_high_col, val_high_col, size_high_col, time_low_col, val_low_col, size_low_col, chart_title, series_name, val_name)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| time_col | string | ✔️ | The name of the column containing the dynamic array of the time points of the original time series |
| val_col | string | ✔️ | The name of the column containing the values of the original time series |
| baseline_col | string | ✔️ | The name of the column containing the values of the baseline time series. Anomalies are usually detected by large value offset from the expected baseline value. |
| time_high_col | string | ✔️ | The name of the column containing the time points of high (above the baseline) anomalies |
| val_high_col | string | ✔️ | The name of the column containing the values of the high anomalies |
| size_high_col | string | ✔️ | The name of the column containing the marker sizes of the high anomalies |
| time_low_col | string | ✔️ | The name of the column containing the time points of low anomalies |
| val_low_col | string | ✔️ | The name of the column containing the values of the low anomalies |
| size_low_col | string | ✔️ | The name of the column containing the marker sizes of the low anomalies |
| chart_title | string | | The chart title. The default is 'Anomaly chart'. |
| series_name | string | | The time series name. The default is 'Metric'. |
| val_name | string | | The value axis name. The default is 'Value'. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render anomaly chart using plotly template")
plotly_anomaly_fl(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
};
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime) on (
summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly
Stored
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime) on (
summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly
Output
The output is a Plotly JSON string that can be rendered using ‘| render plotly’ or in an Azure Data Explorer dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards . The output is a Plotly JSON string that can be rendered in a Real-Time dashboard tile. For more information on creating dashboard tiles, see Real-Time dashboards.
The rendered output is an interactive anomaly chart; you can zoom in and hover over anomalies.
34 - plotly_gauge_fl()
The function plotly_gauge_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create a gauge chart.
The function accepts a few parameters to customize the gauge chart and returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘gauge’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_gauge_fl(value, max_range, mode, chart_title, font_color, bar_color, bar_bg_color, tick_color, tick_width)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| value | real | ✔️ | The number to be displayed. |
| max_range | real | | The maximum range of the gauge. |
| mode | string | | Specifies how the value is displayed on the gauge. The default is 'gauge+number'. |
| chart_title | string | | The chart title. The default is an empty title. |
| font_color | string | | The chart's font color. The default is 'black'. |
| bar_color | string | | The color of the gauge's filled bar. The default is 'green'. |
| bar_bg_color | string | | The color of the gauge's unfilled bar. The default is 'lightgreen'. |
| tick_color | string | | The color of the gauge's ticks. The default is 'darkblue'. |
| tick_width | int | | The width of the gauge's ticks. The default is 1. |
Plotly gauge charts support many more parameters; this function exposes only the ones above. For more information, see the indicator traces reference.
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
};
// Write your query to use your function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render gauge chart using plotly template")
plotly_gauge_fl(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
};
plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly
Stored
plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly
Output
The output is a Plotly JSON string that can be rendered in an Azure Data Explorer or Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards or Real-Time dashboards.

35 - plotly_scatter3d_fl()
The function plotly_scatter3d_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive 3D scatter chart.
The function accepts a table containing the records to be rendered, the names of the x, y, z, and aggregation columns, and the chart title string. The function returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘scatter3d’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_scatter3d_fl(x_col, y_col, z_col, aggr_col [, chart_title ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| x_col | string | ✔️ | The name of the column for the X coordinate of the 3D plot. |
| y_col | string | ✔️ | The name of the column for the Y coordinate of the 3D plot. |
| z_col | string | ✔️ | The name of the column for the Z coordinate of the 3D plot. |
| aggr_col | string | ✔️ | The name of the grouping column. Records in the same group are rendered in a distinct color. |
| chart_title | string | | The chart title. The default is '3D Scatter chart'. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
};
// Write your query to use your function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render 3D scatter chart using plotly template")
plotly_scatter3d_fl(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
};
Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
| render plotly
Stored
Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
| render plotly
Output
The output is a Plotly JSON string that can be rendered in an Azure Data Explorer or Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards or Real-Time dashboards.

You can rotate, zoom and hover over specific records:

36 - predict_fl()
The function predict_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model was built using Scikit-learn, serialized to string, and saved in a standard table.
Syntax
T | invoke predict_fl(models_tbl, model_name, features_cols, pred_col)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| models_tbl | string | ✔️ | The name of the table that contains all serialized models. The table must have the following columns: name (the model name), timestamp (time of model training), and model (string representation of the serialized model). |
| model_name | string | ✔️ | The name of the specific model to use. |
| features_cols | dynamic | ✔️ | An array containing the names of the feature columns that are used by the model for prediction. |
| pred_col | string | ✔️ | The name of the column that stores the predictions. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
};
// Write your code to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "Predict using ML model, build by Scikit-learn")
predict_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Stored
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Output
| Occupancy | pred_Occupancy | n |
|---|---|---|
| TRUE | TRUE | 3006 |
| FALSE | TRUE | 112 |
| TRUE | FALSE | 15 |
| FALSE | FALSE | 9284 |
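These four rows form the confusion matrix of the classifier on the test set: out of 12,417 samples, 3,006 + 9,284 = 12,290 are classified correctly (an accuracy of roughly 99%), with 112 false positives and 15 false negatives.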
Model asset
Get the sample dataset and the pre-trained model. The Python plugin must be enabled. The model string below is a hex-encoded, pickled scikit-learn classifier, which predict_fl() decodes using binascii.unhexlify() and pickle.loads().
//dataset
.set OccupancyDetection <| cluster('help').database('Samples').OccupancyDetection
//model
.set ML_Models <| datatable(name:string, timestamp:datetime, model:string) [
'Occupancy', datetime(now), '800363736b6c6561726e2e6c696e6561725f6d6f64656c2e6c6f6769737469630a4c6f67697374696352656772657373696f6e0a7100298171017d710228580700000070656e616c7479710358020000006c32710458040000006475616c7105895803000000746f6c7106473f1a36e2eb1c432d5801000000437107473ff0000000000000580d0000006669745f696e746572636570747108885811000000696e746572636570745f7363616c696e6771094b01580c000000636c6173735f776569676874710a4e580c00000072616e646f6d5f7374617465710b4e5806000000736f6c766572710c58090000006c69626c696e656172710d58080000006d61785f69746572710e4b64580b0000006d756c74695f636c617373710f58030000006f767271105807000000766572626f736571114b00580a0000007761726d5f737461727471128958060000006e5f6a6f627371134b015808000000636c61737365735f7114636e756d70792e636f72652e6d756c746961727261790a5f7265636f6e7374727563740a7115636e756d70790a6e6461727261790a71164b00857117430162711887711952711a284b014b0285711b636e756d70790a64747970650a711c58020000006231711d4b004b0187711e52711f284b0358010000007c71204e4e4e4affffffff4affffffff4b007471216289430200017122747123625805000000636f65665f7124681568164b008571256818877126527127284b014b014b05867128681c5802000000663871294b004b0187712a52712b284b0358010000003c712c4e4e4e4affffffff4affffffff4b0074712d628943286a02e0d50687e0bfc6d7c974fa93a63fb3d3b8080e6e943ffceb15defdad713f14c3a76bd73202bf712e74712f62580a000000696e746572636570745f7130681568164b008571316818877132527133284b014b01857134682b894308f1e89f57711290bf71357471366258070000006e5f697465725f7137681568164b00857138681887713952713a284b014b0185713b681c58020000006934713c4b004b0187713d52713e284b03682c4e4e4e4affffffff4affffffff4b0074713f628943040c00000071407471416258100000005f736b6c6561726e5f76657273696f6e71425806000000302e31392e32714375622e'
]
37 - predict_onnx_fl()
The function predict_onnx_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model has been converted to ONNX format, serialized to string, and saved in a standard table.
Syntax
T | invoke predict_onnx_fl(models_tbl, model_name, features_cols, pred_col)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| models_tbl | string | ✔️ | The name of the table that contains all serialized models. The table must have the following columns: name (the model name), timestamp (time of model training), and model (string representation of the serialized model). |
| model_name | string | ✔️ | The name of the specific model to use. |
| features_cols | dynamic | ✔️ | An array containing the names of the feature columns that are used by the model for prediction. |
| pred_col | string | ✔️ | The name of the column that stores the predictions. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "Predict using ONNX model")
predict_onnx_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Stored
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Output
| Occupancy | pred_Occupancy | n |
|---|---|---|
| TRUE | TRUE | 3006 |
| FALSE | TRUE | 112 |
| TRUE | FALSE | 15 |
| FALSE | FALSE | 9284 |
38 - quantize_fl()
The function quantize_fl() is a user-defined function (UDF) that bins metric columns. It quantizes metric columns to categorical labels whose bin edges are found by one-dimensional K-Means clustering of each column (using scikit-learn's KBinsDiscretizer with the 'kmeans' strategy), rather than by equal-width binning.
Syntax
T | invoke quantize_fl(num_bins, in_cols, out_cols [, labels ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| num_bins | int | ✔️ | The required number of bins. |
| in_cols | dynamic | ✔️ | An array containing the names of the columns to quantize. |
| out_cols | dynamic | ✔️ | An array containing the names of the respective output columns for the binned values. |
| labels | dynamic | | An array containing the label names. If unspecified, bin ranges will be used. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "Binning metric columns")
quantize_fl(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic)
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
union
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))
Stored
union
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))
Output
| x | x_label | x_bin |
|---|---|---|
| 1 | Low | 1.0-7.75 |
| 2 | Low | 1.0-7.75 |
| 3 | Low | 1.0-7.75 |
| 4 | Low | 1.0-7.75 |
| 5 | Low | 1.0-7.75 |
| 20 | High | 17.5-25.0 |
| 21 | High | 17.5-25.0 |
| 22 | High | 17.5-25.0 |
| 23 | High | 17.5-25.0 |
| 24 | High | 17.5-25.0 |
| 25 | High | 17.5-25.0 |
| 10 | Med | 7.75-17.5 |
| 11 | Med | 7.75-17.5 |
| 12 | Med | 7.75-17.5 |
| 13 | Med | 7.75-17.5 |
| 14 | Med | 7.75-17.5 |
| 15 | Med | 7.75-17.5 |
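The bin ranges in x_bin follow from the one-dimensional k-means clustering of the values: the three cluster centers are 3, 12.5, and 22.5 (the means of the three input ranges), and each inner bin edge is the midpoint between adjacent centers, for example (3 + 12.5) / 2 = 7.75 and (12.5 + 22.5) / 2 = 17.5.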
39 - series_clean_anomalies_fl()
Cleans anomalous points in a series.
The function series_clean_anomalies_fl() is a user-defined function (UDF) that takes a dynamic numerical array and a matching array of anomalies as input, and replaces each anomaly in the input array with the interpolated value of its adjacent points.
Syntax
series_clean_anomalies_fl(y_series, anomalies)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | The input array of numeric values. |
| anomalies | dynamic | ✔️ | An array containing 0 for normal points, or any other value for anomalous points. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Replace anomalies by interpolated value", skipvalidation = "true")
series_clean_anomalies_fl(y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
}
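As a quick illustration of the interpolation, the following minimal query uses made-up values (assuming the function is defined as above or stored in the database):
print y = dynamic([10, 12, 100, 11, 13]), anomalies = dynamic([0, 0, 1, 0, 0])
| extend y_clean = series_clean_anomalies_fl(y, anomalies)
The anomalous value 100 is replaced by 11.5, the linear interpolation of its neighbors 12 and 11.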
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
}
;
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)
Stored
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)
Output

40 - series_cosine_similarity_fl()
Calculates the cosine similarity of two numerical vectors.
The function series_cosine_similarity_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their cosine similarity.
Syntax
series_cosine_similarity_fl(vec1, vec2, [ vec1_size [, vec2_size ]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| vec1 | dynamic | ✔️ | An array of numeric values. |
| vec2 | dynamic | ✔️ | An array of numeric values that is the same length as vec1. |
| vec1_size | real | | The size of vec1. This is equivalent to the square root of the dot product of the vector with itself. |
| vec2_size | real | | The size of vec2. |
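The function implements the standard cosine similarity formula: the dot product of the two vectors divided by the product of their magnitudes. The optional size parameters let you pass precomputed magnitudes to avoid recalculating them. A minimal sketch of the same calculation using only native functions (the values are arbitrary):
print v1 = dynamic([2, 0]), v2 = dynamic([2, 2])
| extend cos_sim = series_dot_product(v1, v2) / (sqrt(series_dot_product(v1, v1)) * sqrt(series_dot_product(v2, v2)))
Here cos_sim evaluates to 4 / (2 * sqrt(8)) ≈ 0.707, the cosine of the 45° angle between the two vectors.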
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the Cosine similarity of 2 numerical arrays")
series_cosine_similarity_fl(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
};
let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360
Stored
let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360
Output
| angle |
|---|
| 45 |
41 - series_dbl_exp_smoothing_fl()
Applies a double exponential smoothing filter on a series.
The function series_dbl_exp_smoothing_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a double exponential smoothing filter. When there is trend in the series, this function is superior to the series_exp_smoothing_fl() function, which implements a basic exponential smoothing filter.
Syntax
series_dbl_exp_smoothing_fl(y_series [, alpha [, beta ]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array of numeric values. |
| alpha | real | | A value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha. The default is 0.5. |
| beta | real | | A value in the range [0-1] that specifies the weight of the last slope vs. the weight of the previous slopes, which is 1 - beta. The default is 0.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Double exponential smoothing for a series")
series_dbl_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
}
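The series_iir() coefficients in the definition above realize (up to the initialization of the first samples) Holt's double exponential smoothing recurrences, where s_t is the smoothed level and b_t is the trend estimate:

$$
s_t = \alpha y_t + (1 - \alpha)(s_{t-1} + b_{t-1}), \qquad
b_t = \beta (s_t - s_{t-1}) + (1 - \beta) b_{t-1}
$$

The function returns the level series s_t.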
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4)
| render linechart
Stored
range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4)
| render linechart
Output

42 - series_dot_product_fl()
Calculates the dot product of two numerical vectors.
The function series_dot_product_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their dot product.
Syntax
series_dot_product_fl(vec1, vec2)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| vec1 | dynamic | ✔️ | An array of numeric values. |
| vec2 | dynamic | ✔️ | An array of numeric values that is the same length as vec1. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the dot product of 2 numerical arrays")
series_dot_product_fl(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
};
union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)
Stored
union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)
Output

43 - series_downsample_fl()
The function series_downsample_fl() is a user-defined function (UDF) that downsamples a time series by an integer factor. This function takes a table containing multiple time series (dynamic numerical arrays), and downsamples each series. The output contains both the coarser series and its respective times array. To avoid aliasing, the function applies a simple low pass filter on each series before subsampling.
Syntax
T | invoke series_downsample_fl(t_col, y_col, ds_t_col, ds_y_col, sampling_factor)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column that contains the time axis of the series to downsample. |
| y_col | string | ✔️ | The name of the column that contains the series to downsample. |
| ds_t_col | string | ✔️ | The name of the column to store the downsampled time axis of each series. |
| ds_y_col | string | ✔️ | The name of the column to store the downsampled series. |
| sampling_factor | int | ✔️ | An integer specifying the required downsampling factor. |
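To illustrate the two steps the function performs, the following minimal sketch downsamples a single hard-coded array by a factor of 4 using the same native building blocks: series_fir() applies the anti-aliasing moving average, and the mv-apply subquery keeps one sample out of every 4 (the values and output column name are hypothetical):
print y = dynamic([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
| extend y_f = series_fir(y, repeat(1, 4), true, true) // centered moving average of length 4 (low pass filter)
| mv-apply y_f to typeof(double) on
  (
    extend rid = row_number() - 1
    | where rid % 4 == 1 // sub-sample: keep one sample of every group of 4
    | summarize y_downsampled = make_list(y_f)
  )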
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Downsampling a series by an integer factor")
series_downsample_fl(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
};
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)
Stored
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)
Output
The time series downsampled by 4:

For reference, here is the original time series (before downsampling):
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| render timechart with(xcolumn=TimeStamp, ycolumns=num)

44 - series_exp_smoothing_fl()
Applies a basic exponential smoothing filter on a series.
The function series_exp_smoothing_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a basic exponential smoothing filter.
Syntax
series_exp_smoothing_fl(y_series [, alpha ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| alpha | real | | A value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha. The default is 0.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Basic exponential smoothing for a series")
series_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
}
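The series_iir() filter above implements the standard exponential smoothing recurrence, in which each smoothed point s_t is a weighted average of the current sample y_t and the previous smoothed value:

$$
s_t = \alpha y_t + (1 - \alpha) s_{t-1}
$$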
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4)
| render linechart
Stored
range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4)
| render linechart
Output

45 - series_fbprophet_forecast_fl()
The function series_fbprophet_forecast_fl() is a user-defined function (UDF) that takes an expression containing a time series as input, and predicts the values of the last trailing points using the Prophet algorithm. The function returns both the forecasted points and their confidence intervals. This function is a Kusto Query Language (KQL) wrapper to the Prophet() class, and exposes only the parameters that are mandatory for prediction. Feel free to modify your copy to support more parameters, such as holidays, change points, Fourier order, and so on.
- Install the fbprophet package, since it isn't included in the Python image. To install the package, do the following:
  - Follow the guidelines for Installing packages for the Python plugin. To save time, you can download the prophet ZIP file, containing the wheel files of prophet and its dependencies, from https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip. Save this file to your allowlisted blob container.
  - Create a SAS token with read access to your ZIP file. To create a SAS token, see get the SAS for a blob container.
  - In the Example, replace the URL reference in the external_artifacts parameter with your file path and its SAS token.
Syntax
T | invoke series_fbprophet_forecast_fl(ts_series, y_series, y_pred_series, [ points ], [ y_pred_low_series ], [ y_pred_high_series ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| ts_series | string | ✔️ | The name of the input table column containing the time stamps of the series to predict. |
| y_series | string | ✔️ | The name of the input table column containing the values of the series to predict. |
| y_pred_series | string | ✔️ | The name of the column to store the predicted series. |
| points | int | ✔️ | The number of points at the end of the series to predict (forecast). These points are excluded from the learning (regression) process. The default is 0. |
| y_pred_low_series | string | | The name of the column to store the series of the lowest values of the confidence interval. Omit if the confidence interval isn't needed. |
| y_pred_high_series | string | | The name of the column to store the series of the highest values of the confidence interval. Omit if the confidence interval isn't needed. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Forecast using Facebook fbprophet package")
series_fbprophet_forecast_fl(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
//
// Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart
Stored
//
// Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart
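In this example, the forecast horizon is toint(horizon/dt) = toint(7d/2h) = 84 points, so the last 84 two-hour bins of each series are excluded from training and are predicted by both methods.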
Output

46 - series_fit_lowess_fl()
The function series_fit_lowess_fl() is a user-defined function (UDF) that applies a LOWESS regression on a series. This function takes a table with multiple series (dynamic numerical arrays) and generates a LOWESS curve, which is a smoothed version of the original series.
Syntax
T | invoke series_fit_lowess_fl(y_series, y_fit_series, [ fit_size ], [ x_series ], [ x_istime ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | string | ✔️ | The name of the input table column containing the dependent variable. This column is the series to fit. |
| y_fit_series | string | ✔️ | The name of the column to store the fitted series. |
| fit_size | int | | For each point, the local regression is applied on its respective fit_size closest points. The default is 5. |
| x_series | string | | The name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series. |
| x_istime | bool | | This boolean parameter is needed only if x_series is specified and it's a vector of datetime. The default is false. |
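Internally, fit_size is converted to the LOWESS smoothing fraction fit_size / len(y). For example, in the first example below the series contains 288 five-minute bins, so fit_size = 9 corresponds to a fraction of 9/288 ≈ 0.03.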
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Fits a local polynomial using LOWESS method to a series")
series_fit_lowess_fl(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Examples
The following examples use the invoke operator to run the function.
LOWESS regression on regular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart
Stored
//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart
Output

Test irregular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart
Stored
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart
Output

Compare LOWESS versus polynomial fit
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart
Stored
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart
Output

47 - series_fit_poly_fl()
The function series_fit_poly_fl() is a user-defined function (UDF) that applies a polynomial regression on a series. This function takes a table containing multiple series (dynamic numerical arrays) and generates the best fit high-order polynomial for each series using polynomial regression. This function returns both the polynomial coefficients and the interpolated polynomial over the range of the series.
Syntax
T | invoke series_fit_poly_fl(y_series, y_fit_series, fit_coeff, degree, [ x_series ], [ x_istime ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | string | ✔️ | The name of the input table column containing the dependent variable. That is, the series to fit. |
| y_fit_series | string | ✔️ | The name of the column to store the best fit series. |
| fit_coeff | string | ✔️ | The name of the column to store the best fit polynomial coefficients. |
| degree | int | ✔️ | The required order of the polynomial to fit. For example, 1 for linear regression, 2 for quadratic regression, and so on. |
| x_series | string | | The name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series. |
| x_istime | bool | | This parameter is needed only if x_series is specified and it's a vector of datetime. The default is false. |
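The coefficients stored in fit_coeff are ordered from the highest degree down to the constant term (the order returned by numpy.polyfit): for degree = n, fit_coeff contains n+1 values [a_n, ..., a_1, a_0], and the fitted value is a_n*x^n + ... + a_1*x + a_0.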
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Fit a polynomial of a specified degree to a series")
series_fit_poly_fl(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=false)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Examples
The following examples use the invoke operator to run the function.
Fit fifth order polynomial to a regular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)
Stored
//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)
Output

Test irregular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)
Stored
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)
Output

Fifth order polynomial with noise on x & y axes
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
| fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart
Stored
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
| fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart
Output


48 - series_lag_fl()
Applies a lag on a series.
The function series_lag_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and shifts it backward. It's commonly used for shifting a time series to test whether a pattern is new or matches historical data.
Syntax
series_lag_fl(y_series, offset)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| offset | int | ✔️ | An integer specifying the required offset in bins. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_lag_fl = (series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Shift a series by a specified offset")
series_lag_fl(series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
}
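The lag filter built above is a vector of offset zeros followed by a single 1 (for example, [0, 0, 1] for offset = 2), so the FIR convolution returns each point offset bins back, that is, the series delayed by offset bins. A minimal check with made-up values, assuming the function is defined as above or stored in the database:
print y = dynamic([10, 20, 30, 40])
| extend y_lagged = series_lag_fl(y, 2) // y_lagged contains y delayed by 2 bins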
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_lag_fl = (series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
};
let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart
Stored
let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart
Output

49 - series_metric_fl()
The series_metric_fl() function is a user-defined function (UDF) that selects and retrieves time series of metrics ingested to your database using the Prometheus monitoring system. This function assumes the data stored in your database is structured following the Prometheus data model. Specifically, each record contains:
- timestamp
- metric name
- metric value
- a variable set of labels ("key":"value" pairs)
Prometheus defines a time series by its metric name and a distinct set of labels. You can retrieve sets of time series using Prometheus Query Language (PromQL) by specifying the metric name and time series selector (a set of labels).
Syntax
T | invoke series_metric_fl(timestamp_col, name_col, labels_col, value_col, metric_name, labels_selector, lookback, offset)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| timestamp_col | string | ✔️ | The name of the column containing the timestamp. |
| name_col | string | ✔️ | The name of the column containing the metric name. |
| labels_col | string | ✔️ | The name of the column containing the labels dictionary. |
| value_col | string | ✔️ | The name of the column containing the metric value. |
| metric_name | string | ✔️ | The metric time series to retrieve. |
| labels_selector | string | | Time series selector string, similar to PromQL. It's a string containing a list of "key":"value" pairs, for example '"key1":"val1","key2":"val2"'. The default is an empty string, which means no filtering. Note that regular expressions are not supported. |
| lookback | timespan | | The range vector to retrieve, similar to PromQL. The default is 10 minutes. |
| offset | timespan | | Offset back from current time to retrieve, similar to PromQL. Data is retrieved from ago(offset)-lookback to ago(offset). The default is 0, which means that data is retrieved up to now(). |
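For example, in the queries below, offset = now() - datetime(2020-12-08 00:00) makes ago(offset) equal to 2020-12-08 00:00, so the function retrieves the default 10-minute lookback of samples ending at that time.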
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\Series", docstring = "Selecting & retrieving metrics like PromQL")
series_metric_fl(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
}
Examples
The following examples use the invoke operator to run the function.
With specifying selector
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)
Stored
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)
Output

Without specifying selector
The following example doesn’t specify a selector, so all ‘writes’ metrics are selected. This example assumes that the function is already installed, and uses the alternative direct calling syntax, specifying the input table as the first parameter:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)
Stored
series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)
Output

50 - series_monthly_decompose_anomalies_fl()
Detect anomalous points in a daily series with monthly seasonality.
The function series_monthly_decompose_anomalies_fl() is a user-defined function (UDF) that detects anomalies in multiple time series that have monthly seasonality. The function is built on top of series_decompose_anomalies(). The challenge is that the length of a month varies between 28 and 31 days, so a baseline built with series_decompose_anomalies() out of the box assumes a fixed-length season and therefore fails to match spikes or other patterns that occur on the 1st or any other specific day of each month.
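To work around this, the function maps each date to a "virtual day of year" in which every month is padded to 31 days, so a pattern recurring on a given day of the month always lands on the same virtual index. The following is a minimal sketch (not part of the library, using arbitrary sample dates) of that mapping:
datatable(_date:datetime) [
    datetime(2023-01-01), datetime(2023-02-01), datetime(2023-02-28), datetime(2023-03-01)
]
| extend _dom = dayofmonth(_date), _moy = monthofyear(_date)
| extend _vdoy = 31*(_moy-1) + _dom   // virtual day of year: 1, 32, 59, 63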
Syntax
series_monthly_decompose_anomalies_fl(threshold)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| threshold | real | | Anomaly threshold. Default is 1.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for daily time series with monthly seasonality")
series_monthly_decompose_anomalies_fl(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
}
Example
The input table must contain _key, _date and _val columns. The query builds a set of time series of _val per each _key and adds anomalies, score and baseline columns.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
};
demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)
Stored
demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)
Output
Series A with monthly anomalies:

Series B with monthly anomalies:

51 - series_moving_avg_fl()
Applies a moving average filter on a series.
The function series_moving_avg_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a simple moving average filter to it.
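Internally, the function is a thin wrapper around series_fir(): a filter of n ones, normalized, is a simple moving average of width n. The following is a minimal sketch (not part of the library) of that call on a literal array:
print y = dynamic([1.0, 2.0, 4.0, 8.0, 16.0])
| extend ma3 = series_fir(y, repeat(1, 3), true, false)   // trailing 3-point moving average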
Syntax
series_moving_avg_fl(y_series, n [, center ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| n | int | ✔️ | The width of the moving average filter. |
| center | bool | | Indicates whether the moving average is applied symmetrically on a window before and after the current point, or on a window from the current point backwards. By default, center is false. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving average of specified width")
series_moving_avg_fl(y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
}
Example
The following example uses the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
};
//
// Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, True)
| render timechart
Stored
//
// Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, True)
| render timechart
Output

52 - series_moving_var_fl()
Applies a moving variance filter on a series.
The function series_moving_var_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a moving variance filter to it.
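The filter is computed from two moving averages using the identity Var(X) = E[X^2] - (E[X])^2. The following is a minimal sketch (not part of the library) of that calculation on a literal array:
print y = dynamic([1.0, 2.0, 4.0, 8.0, 16.0])
| extend ey  = series_fir(y, repeat(1, 3), true, false)                        // E[X] over a 3-point window
| extend ey2 = series_fir(series_multiply(y, y), repeat(1, 3), true, false)    // E[X^2] over the same window
| extend var3 = series_subtract(ey2, series_multiply(ey, ey))                  // Var = E[X^2] - (E[X])^2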
Syntax
series_moving_var_fl(y_series, n [, center ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| n | int | ✔️ | The width of the moving variance filter. |
| center | bool | | Indicates whether the moving variance is applied symmetrically on a window before and after the current point, or on a window from the current point backwards. By default, center is false. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving variance of specified width")
series_moving_var_fl(y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
}
;
let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart
Stored
let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart
Output

53 - series_mv_ee_anomalies_fl()
The function series_mv_ee_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the elliptic envelope model from scikit-learn. This model assumes that the source of the multivariate data is a multivariate normal distribution. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies in the whole series. The function builds a multi-dimensional elliptical envelope for each series and marks the points that fall outside this normal envelope as anomalies.
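The input table is expected to hold one row per series, with each feature packed into a dynamic array and an empty column prepared to receive the detected anomalies. The following is a minimal sketch of that shaping step, using a hypothetical raw_metrics table with per-sample columns name, t, x, and y:
// 'raw_metrics' is a hypothetical per-sample table with columns name, t, x, and y
raw_metrics
| summarize t=make_list(t), x=make_list(x), y=make_list(y) by name
| extend anomalies=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies')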
Syntax
T | invoke series_mv_ee_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct ]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional normally distributed data using elliptical envelope model")
series_mv_ee_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

You can see that on TS1 most of the midnight anomalies were detected using this multivariate model.
Create a sample dataset
.set normal_2d_with_anomalies <|
//
let window=14d;
let dt=1h;
let n=toint(window/dt);
let rand_normal_fl=(avg:real=0.0, stdv:real=1.0)
{
let x =rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand();
(x - 6)*stdv + avg
};
union
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(10, 5)
| extend y=iff(hourofday(t) == 0, 2*(10-x)+7+rand_normal_fl(0, 3), 2*x+7+rand_normal_fl(0, 3)) // anomalies every midnight
| extend name='TS1'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(15, 3)
| extend y=iff(hourofday(t) == 8, (15-x)+10+rand_normal_fl(0, 2), x-7+rand_normal_fl(0, 1)) // anomalies every 8am
| extend name='TS2'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(8, 6)
| extend y=iff(hourofday(t) == 16, x+5+rand_normal_fl(0, 4), (12-x)+rand_normal_fl(0, 4)) // anomalies every 4pm
| extend name='TS3')
| summarize t=make_list(t), x=make_list(x), y=make_list(y) by name

54 - series_mv_if_anomalies_fl()
The function series_mv_if_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the isolation forest model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies in the whole series. The function builds an ensemble of isolation trees for each series and marks the points that are quickly isolated as anomalies.
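The number of trees and the per-tree sample percentage can be tuned to trade detection quality for run time on long series. The following is a minimal sketch with assumed parameter values, using the normal_2d_with_anomalies sample table from the example below:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', anomalies_pct=4, num_trees=200, samples_pct=50)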
Syntax
T | invoke series_mv_if_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct [, num_trees [, samples_pct ]]]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
| num_trees | int | | The number of isolation trees to build for each time series. Default value: 100. |
| samples_pct | real | | A real number in the range [0-100] specifying the percentage of samples used to build each tree. Default value: 100%, i.e. use the full series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using isolation forest model")
series_mv_if_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS2'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

You can see that on TS2 most of the anomalies occurring at 8am were detected using this multivariate model.
55 - series_mv_oc_anomalies_fl()
The function series_mv_oc_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the One-Class SVM model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies in the whole series. The function trains a one-class SVM for each series and marks the points that fall outside the hypersphere as anomalies.
Syntax
T | invoke series_mv_oc_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct ]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using One Class SVM model")
series_mv_oc_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

You can see that on TS1 most of the anomalies occurring at midnight were detected using this multivariate model.
56 - series_rate_fl()
The function series_rate_fl() is a user-defined function (UDF) that calculates the average rate of metric increase per second. Its logic follows the PromQL rate() function. It should be used for time series of counter metrics ingested to your database by the Prometheus monitoring system and retrieved by series_metric_fl().
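The following is a minimal sketch (not part of the library, using made-up counter values) of the core rate calculation, mirroring the function body: shift the timestamp and value arrays by one bin, take the differences, and divide the value delta by the time delta in seconds. Counter-reset correction is omitted here.
print timestamp = pack_array(datetime(2020-12-07 00:00:00), datetime(2020-12-07 00:01:00), datetime(2020-12-07 00:02:00)),
      value = pack_array(100.0, 160.0, 220.0)
| extend timestampS = array_shift_right(timestamp, 1), valueS = array_shift_right(value, 1)
| extend dt = series_divide(series_subtract(timestamp, timestampS), 1e7)   // bin gap in seconds (ticks/1e7)
| extend rate = series_divide(series_subtract(value, valueS), dt)          // ~1.0 increase per second here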
Syntax
T | invoke series_rate_fl([ n_bins [, fix_reset ]])
T is a table returned from series_metric_fl(). Its schema includes (timestamp:dynamic, name:string, labels:string, value:dynamic).
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| n_bins | int | | The number of bins specifying the gap between the extracted metric values used to calculate the rate. The function calculates the difference between the current sample and the one n_bins before it, and divides it by the difference of their respective timestamps in seconds. The default is one bin. The default settings calculate irate(), the PromQL instantaneous rate function. |
| fix_reset | bool | | Controls whether to check for counter resets and correct them, like the PromQL rate() function. The default is true. Set it to false to skip this analysis when there's no need to check for resets. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\Series", docstring = "Simulate PromQL rate()")
series_rate_fl(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
}
Examples
The following examples use the invoke operator to run the function.
Calculate average rate of metric increase
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
//
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)
Stored
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)
Output

Selects the main disk of two hosts
The following example selects the main disk of two hosts, and assumes that the function is already installed. This example uses alternative direct calling syntax, specifying the input table as the first parameter:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
//
series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)
Stored
series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)
Output

57 - series_rolling_fl()
The function series_rolling_fl() is a user-defined function (UDF) that applies rolling aggregation on a series. It takes a table containing multiple series (dynamic numerical array) and applies, for each series, a rolling aggregation function.
Syntax
T | invoke series_rolling_fl(y_series, y_rolling_series, n, aggr, aggr_params, center)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | string | ✔️ | The name of the column that contains the series to fit. |
| y_rolling_series | string | ✔️ | The name of the column to store the rolling aggregation series. |
| n | int | ✔️ | The width of the rolling window. |
| aggr | string | ✔️ | The name of the aggregation function to use. See aggregation functions. |
| aggr_params | dynamic | | Optional parameters for the aggregation function. |
| center | bool | | Indicates whether the rolling window is applied symmetrically before and after the current point, or applied from the current point backwards. By default, center is false, for calculation on streaming data. |
Aggregation functions
This function supports any aggregation function from numpy or scipy.stats that calculates a scalar out of a series. The following list isn’t exhaustive:
- sum
- mean
- min
- max
- ptp (max-min)
- percentile
- median
- std
- var
- gmean (geometric mean)
- hmean (harmonic mean)
- mode (most common value)
- moment (nth moment)
- tmean (trimmed mean)
- tmin
- tmax
- tstd
- iqr (interquartile range)
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Rolling window functions on a series")
series_rolling_fl(tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic, center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Examples
The following examples use the invoke operator to run the function.
Calculate rolling median of 9 elements
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median')
| render timechart
Stored
//
// Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median', dynamic([null]))
| render timechart
Output

Calculate rolling min, max & 75th percentile of 15 elements
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart
Stored
//
// Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart
Output

Calculate the rolling trimmed mean
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) // trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart
Stored
range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) // trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart
Output

58 - series_shapes_fl()
The function series_shapes_fl() is a user-defined function (UDF) that detects a positive or negative trend or jump in a series. This function takes a table containing multiple time series (dynamic numerical arrays), and calculates trend and jump scores for each series. The output is a dictionary (dynamic) containing the scores.
Syntax
T | extend series_shapes_fl(y_series [, advanced ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| advanced | bool | | The default is false. Set to true to output additional calculated parameters. |
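The following is a minimal sketch (assuming the function is already installed as a stored function, on a made-up series) of reading the two scores out of the returned dictionary:
print y = dynamic([1.0, 1.1, 0.9, 1.0, 10.0, 10.2, 9.9, 10.1])   // a short series with an upward jump
| extend shapes = series_shapes_fl(y)
| extend trend_score = todouble(shapes.trend_score), jump_score = todouble(shapes.jump_score)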
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Series detector for positive/negative trend or step. Returns a dynamic with trend and jump scores")
series_shapes_fl(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
};
let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)
Stored
let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)
Output

The respective trend and jump scores:
| tsid | shapes |
|---|---|
| 1 | {"trend_score": 0.703199714530169, "jump_score": 0.90909090909090906} |
| 2 | {"trend_score": -0.51663751343174869, "jump_score": -0.90909090909090906} |
| 3 | {"trend_score": 0.92592592592592582, "jump_score": 0.0} |
| 4 | {"trend_score": -0.92592592592592582, "jump_score": 0.0} |
59 - series_uv_anomalies_fl()
The function series_uv_anomalies_fl() is a user-defined function (UDF) that detects anomalies in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays and the required anomaly detection sensitivity level. It converts each time series into the required JSON format and posts it to the Anomaly Detector service endpoint. The service response contains dynamic arrays of high/low/all anomalies, the modeled baseline time series, its normal high and low boundaries (a value above or below the high/low boundary is an anomaly), and the detected seasonality.
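For illustration, the per-series request body built by the inline Python code has roughly the following shape (the sample values below are hypothetical):

```kusto
// A minimal sketch of the request body sent to the Anomaly Detector endpoint (illustrative values only)
print request_body = bag_pack(
    "series", dynamic([{"value": 121.0}, {"value": 118.5}, {"value": 410.0}]),
    "sensitivity", 85)
```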
Prerequisites
- An Azure subscription. Create a free Azure account.
- A cluster and database (see Create a cluster and database), or a KQL database with editing permissions and data.
- The Python plugin must be enabled on the cluster. This is required for the inline Python used in the function.
- Enable the http_request plugin / http_request_post plugin on the cluster to access the anomaly detection service endpoint.
- Modify the callout policy for type webapi to access the anomaly detection service endpoint.
In the following function example, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.
Syntax
T | invoke series_uv_anomalies_fl(y_series [, sensitivity [, tsid]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | string | ✔️ | The name of the input table column containing the values of the series to be anomaly detected. |
| sensitivity | integer | | An integer in the range [0-100] specifying the anomaly detection sensitivity. 0 is the least sensitive detection, while 100 is the most sensitive, meaning that even a small deviation from the expected baseline is tagged as an anomaly. Default value: 85. |
| tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Anomaly Detection by Azure Cognitive Service")
series_uv_anomalies_fl(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
}
Examples
The following examples use the invoke operator to run the function.
Use series_uv_anomalies_fl() to detect anomalies
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)
Stored
let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)
Output

Compare series_uv_anomalies_fl() and native series_decompose_anomalies()
The following example compares the Univariate Anomaly Detection API to the native series_decompose_anomalies() function over three time series and assumes the series_uv_anomalies_fl() function is already defined in the database:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 90, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)
Stored
let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 90, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)
Output
The following graph shows anomalies detected by the Univariate Anomaly Detection API on TS1. You can also select TS2 or TS3 in the chart filter box.

The following graph shows the anomalies detected by the native function on TS1.

60 - series_uv_change_points_fl()
The function series_uv_change_points_fl() is a user-defined function (UDF) that finds change points in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays, the change point detection threshold, and the minimum size of the stable trend window. It converts each time series into the required JSON format and posts it to the Anomaly Detector service endpoint. The service response contains dynamic arrays of change points, their respective confidence, and the detected seasonality.
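For illustration, the per-series request body built by the inline Python code has roughly the following shape (the sample values below are hypothetical):

```kusto
// A minimal sketch of the change point request body (illustrative values only)
print request_body = bag_pack(
    "series", dynamic([{"value": 5.2}, {"value": 5.0}, {"value": 21.3}]),
    "threshold", 0.9,
    "stableTrendWindow", 5)
```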
Prerequisites
- An Azure subscription. Create a free Azure account.
- A cluster and database (see Create a cluster and database), or a KQL database with editing permissions and data.
- The Python plugin must be enabled on the cluster. This is required for the inline Python used in the function.
- Enable the http_request plugin / http_request_post plugin on the cluster to access the anomaly detection service endpoint.
- Modify the callout policy for type webapi to access the anomaly detection service endpoint.
Syntax
T | invoke series_uv_change_points_fl(y_series [, score_threshold [, trend_window [, tsid]]])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| y_series | string | ✔️ | The name of the input table column containing the values of the series to be anomaly detected. |
| score_threshold | real | | A value specifying the minimum confidence to declare a change point. Each point whose confidence is above the threshold is defined as a change point. Default value: 0.9. |
| trend_window | integer | | A value specifying the minimal window size for robust calculation of trend changes. Default value: 5. |
| tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.
let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Change Points Detection by Azure Cognitive Service")
series_uv_change_points_fl(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
};
let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, * // just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)
Stored
let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, * // just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)
Output
The following graph shows change points on a time series.

61 - time_weighted_avg_fl()
The function time_weighted_avg_fl() is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating a simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.
There are two options to calculate the time weighted average. This function fills forward the value from the current sample until the next one. Alternatively, time_weighted_avg2_fl() linearly interpolates the metric value between consecutive samples.
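In each time bin, the fill-forward variant effectively computes the duration-weighted mean sketched below, where $y_i$ is the forward-filled sample value and $\Delta t_i$ is the time from that sample to the next one (this matches the tw_sum/t_sum calculation in the function code that follows):

$$\mathrm{tw\_avg} = \frac{\sum_i y_i\,\Delta t_i}{\sum_i \Delta t_i}$$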
Syntax
T | invoke time_weighted_avg_fl(t_col, y_col, key_col, stime, etime, dt)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using fill forward interpolation")
time_weighted_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Output
| _ts | _key | val |
|---|---|---|
| 2021-04-26 00:00:00.0000000 | Device1 | 150 |
| 2021-04-26 01:00:00.0000000 | Device1 | 225 |
| 2021-04-26 00:00:00.0000000 | Device2 | 500 |
| 2021-04-26 01:00:00.0000000 | Device2 | 400 |
The first value of Device1 is (45m*100 + 15m*300)/60m = 150, and the second value is (15m*300 + 45m*200)/60m = 225.
The first value of Device2 is (30m*600 + 30m*400)/60m = 500, and the second value is (30m*400 + 15m*500 + 15m*300)/60m = 400.
62 - time_weighted_avg2_fl()
The function time_weighted_avg2_fl() is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating a simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.
There are two options to calculate the time weighted average. This function linearly interpolates the metric value between consecutive samples. Alternatively, time_weighted_avg_fl() fills forward the value from the current sample until the next one.
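In each time bin, this variant effectively averages the trapezoids spanned by consecutive (interpolated) samples, where $y_i$ and $y_{i+1}$ are consecutive values and $\Delta t_i$ is the time between them (this matches the tw_sum/t_sum calculation in the function code that follows):

$$\mathrm{tw\_avg} = \frac{\sum_i \frac{y_i + y_{i+1}}{2}\,\Delta t_i}{\sum_i \Delta t_i}$$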
Syntax
T | invoke time_weighted_avg2_fl(t_col, y_col, key_col, stime, etime, dt)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using linear interpolation")
time_weighted_avg2_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Output
| _ts | _key | val |
|---|---|---|
| 2021-04-26 00:00:00.0000000 | Device1 | 218.75 |
| 2021-04-26 01:00:00.0000000 | Device1 | 206.25 |
| 2021-04-26 00:00:00.0000000 | Device2 | 462.5 |
| 2021-04-26 01:00:00.0000000 | Device2 | 412.5 |
The first value of Device1 is (45m*(100+300)/2 + 15m*(300+250)/2)/60m = 218.75, and the second value is (15m*(250+200)/2 + 45m*200)/60m = 206.25.
The first value of Device2 is (30m*(600+400)/2 + 30m*(400+450)/2)/60m = 462.5, and the second value is (30m*(450+500)/2 + 15m*(500+300)/2 + 15m*300)/60m = 412.5.
63 - time_weighted_val_fl()
The function time_weighted_val_fl() is a user-defined function (UDF) that linearly interpolates the metric value at a set of grid time points. Each interpolated value is a time weighted average of the values of its previous sample and its next sample.
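For a grid point at time $t$ that falls between a previous sample $(t_0, y_0)$ and a next sample $(t_1, y_1)$, the interpolated value is (matching the dt0/dt1 weighting in the function code below):

$$y(t) = \frac{y_0\,(t_1 - t) + y_1\,(t - t_0)}{t_1 - t_0}$$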
Syntax
T | invoke time_weighted_val_fl(t_col, y_col, key_col, stime, etime, dt)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Linear interpolation of metric value by time weighted average")
time_weighted_val_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc
Output
| _ts | _key | val | orig_val |
|---|---|---|---|
| 2021-04-26 00:00:00.0000000 | Device1 | 100 | 1 |
| 2021-04-26 00:45:00.0000000 | Device1 | 300 | 1 |
| 2021-04-26 01:00:00.0000000 | Device1 | 250 | 0 |
| 2021-04-26 01:15:00.0000000 | Device1 | 200 | 1 |
| 2021-04-26 00:00:00.0000000 | Device2 | 600 | 1 |
| 2021-04-26 00:30:00.0000000 | Device2 | 400 | 1 |
| 2021-04-26 01:00:00.0000000 | Device2 | 450 | 0 |
| 2021-04-26 01:30:00.0000000 | Device2 | 500 | 1 |
| 2021-04-26 01:45:00.0000000 | Device2 | 300 | 1 |
64 - time_window_rolling_avg_fl()
The function time_window_rolling_avg_fl() is a user-defined function (UDF) that calculates the rolling average of the required value over a constant duration time window.
Calculating the rolling average over a constant time window for a regular time series (that is, one with constant intervals) can be achieved using series_fir(), because the constant time window can be converted to a fixed-width filter of equal coefficients. However, calculating it for an irregular time series is more complex, because the actual number of samples in the window varies. Still, it can be achieved using the powerful scan operator.
This type of rolling window calculation is required for use cases where the metric values are emitted only when changed (and not at constant intervals). For example, in IoT, edge devices send metrics to the cloud only upon changes, optimizing communication bandwidth.
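For comparison, the regular-series case mentioned above can be sketched with series_fir() and a filter of equal coefficients; the example below uses synthetic data and a fixed five-sample window:

```kusto
// Rolling average over a fixed 5-sample window for a regular (evenly spaced) series
range i from 1 to 20 step 1
| summarize y = make_list(todouble(i))
| extend rolling_avg_5 = series_fir(y, repeat(1, 5), true, false)
```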
Syntax
T | invoke time_window_rolling_avg_fl(t_col, y_col, key_col, dt [, direction ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| dt | timespan | ✔️ | The duration of the rolling window. |
| direction | int | | The aggregation direction. The possible values are +1 or -1. A rolling window is set from the current time forward or backward, respectively. Default is -1, as a backward rolling window is the only possible method for streaming scenarios. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time based rolling average of a metric")
time_window_rolling_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(8:00), 1, 'Device1',
datetime(8:01), 2, 'Device1',
datetime(8:05), 3, 'Device1',
datetime(8:05), 10, 'Device2',
datetime(8:09), 20, 'Device2',
datetime(8:40), 4, 'Device1',
datetime(9:00), 5, 'Device1',
datetime(9:01), 6, 'Device1',
datetime(9:05), 30, 'Device2',
datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(8:00), 1, 'Device1',
datetime(8:01), 2, 'Device1',
datetime(8:05), 3, 'Device1',
datetime(8:05), 10, 'Device2',
datetime(8:09), 20, 'Device2',
datetime(8:40), 4, 'Device1',
datetime(9:00), 5, 'Device1',
datetime(9:01), 6, 'Device1',
datetime(9:05), 30, 'Device2',
datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)
Output
| timestamp | value | avg_value | key |
|---|---|---|---|
| 2021-11-29 08:05:00.0000000 | 10 | 10 | Device2 |
| 2021-11-29 08:09:00.0000000 | 20 | 15 | Device2 |
| 2021-11-29 09:05:00.0000000 | 30 | 30 | Device2 |
| 2021-11-29 08:00:00.0000000 | 1 | 1 | Device1 |
| 2021-11-29 08:01:00.0000000 | 2 | 1.5 | Device1 |
| 2021-11-29 08:05:00.0000000 | 3 | 2 | Device1 |
| 2021-11-29 08:40:00.0000000 | 4 | 4 | Device1 |
| 2021-11-29 09:00:00.0000000 | 5 | 5 | Device1 |
| 2021-11-29 09:01:00.0000000 | 6 | 5.5 | Device1 |
| 2021-11-29 09:50:00.0000000 | 7 | 7 | Device1 |
The first value (10) at 8:05 reflects only a single sample that fell within the 10-minute backward window. The second value (15) is the average of the two samples at 8:05 and 8:09, and so on.
65 - two_sample_t_test_fl()
The function two_sample_t_test_fl() is a user-defined function (UDF) that performs the Two-Sample T-Test.
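For reference, when equal_var is true, the underlying scipy ttest_ind call computes the standard pooled two-sample t statistic:

$$t = \frac{\bar{x}_1 - \bar{x}_2}{s_p\sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}, \qquad s_p^2 = \frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2}$$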
Syntax
T | invoke two_sample_t_test_fl(data1, data2, test_statistic, p_value [, equal_var ])
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
| equal_var | bool | | If true (default), performs a standard independent two-sample test that assumes equal population variances. If false, performs Welch's t-test, which doesn't assume equal population variance. In that case, consider using the native welch_test() function. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Two-Sample t-Test")
two_sample_t_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | -1.7415675457565645 | 0.15655096653487446 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | -3.2711673491022579 | 0.030755331219276136 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | -18.5515946201742 | 1.5823717131966134E-06 |
66 - User-defined functions
User-defined functions are reusable subqueries that can be defined as part of the query itself (query-defined functions), or stored as part of the database metadata (stored functions). User-defined functions are invoked through a name, are provided with zero or more input arguments (which can be scalar or tabular), and produce a single value (which can be scalar or tabular) based on the function body.
A user-defined function belongs to one of two categories:
- Scalar functions
- Tabular functions
The function’s input arguments and output determine whether it’s scalar or tabular, which then establishes how it might be used.
To optimize multiple uses of user-defined functions within a single query, see Optimize queries that use named expressions.
We’ve created an assortment of user-defined functions that you can use in your queries. For more information, see Functions library.
Scalar function
- Has zero input arguments, or all its input arguments are scalar values
- Produces a single scalar value
- Can be used wherever a scalar expression is allowed
- May only use the row context in which it’s defined
- Can only refer to tables (and views) that are in the accessible schema
Tabular function
- Accepts one or more tabular input arguments and zero or more scalar input arguments, and/or
- Produces a single tabular value
Function names
Valid user-defined function names must follow the same identifier naming rules as other entities.
The name must also be unique in its scope of definition.
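For example, the following function name follows the identifier rules and is unique within the query scope in which it’s defined (the name and logic are illustrative only):
// A hypothetical, minimal example of a valid, scope-unique function name.
let DoubleIt = (n:long) { n * 2 };
print DoubleIt(21)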
Input arguments
Valid user-defined functions follow these rules:
- A user-defined function has a strongly typed list of zero or more input arguments.
- An input argument has a name, a type, and (for scalar arguments) optionally a default value.
- The name of an input argument is an identifier.
- The type of an input argument is either one of the scalar data types, or a tabular schema.
Syntactically, the input arguments list is a comma-separated list of argument definitions, wrapped in parentheses. Each argument definition is specified as
ArgName:ArgType [= ArgDefaultValue]
For tabular arguments, ArgType has the same syntax as the table definition (parentheses and a list of column name/type pairs), with the addition of a solitary (*) indicating “any tabular schema”.
For example:
| Syntax | Input arguments list description |
|---|---|
| () | No arguments |
| (s:string) | Single scalar argument called s taking a value of type string |
| (a:long, b:bool=true) | Two scalar arguments, the second of which has a default value |
| (T1:(*), T2:(r:real), b:bool) | Three arguments (two tabular arguments and one scalar argument) |
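As a sketch of the last signature in the table, the following function takes two tabular arguments and one scalar argument; the name and logic are illustrative only:
// Hypothetical example: an open-schema tabular argument, a tabular argument
// requiring a real column r, and a scalar bool that gates the second input.
let UnionIfEnabled = (T1:(*), T2:(r:real), b:bool) {
    union (T1 | extend source="first"), (T2 | where b | extend source="second")
};
UnionIfEnabled((range x from 1 to 2 step 1), (datatable(r:real)[1.5, 2.5]), true)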
Examples
Scalar function
let Add7 = (arg0:long = 5) { arg0 + 7 };
range x from 1 to 10 step 1
| extend x_plus_7 = Add7(x), five_plus_seven = Add7()
Tabular function with no arguments
let tenNumbers = () { range x from 1 to 10 step 1};
tenNumbers
| extend x_plus_7 = x + 7
Tabular function with arguments
let MyFilter = (T:(x:long), v:long) {
T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)
Output
| x |
|---|
| 9 |
| 10 |
The following example shows a tabular function that uses a tabular input with no columns specified. Any table can be passed to the function, and no table columns can be referenced inside it.
let MyDistinct = (T:(*)) {
T | distinct *
};
MyDistinct((range x from 1 to 3 step 1))
Output
| x |
|---|
| 1 |
| 2 |
| 3 |
Declaring user-defined functions
The declaration of a user-defined function provides:
- Function name
- Function schema (parameters it accepts, if any)
- Function body
let f=(s:string, i:long) {
tolong(s) * i
};
The function body includes:
- Exactly one expression, which provides the function’s return value (scalar or tabular value).
- Any number (zero or more) of let statements, whose scope is that of the function body. If specified, the let statements must precede the expression defining the function’s return value.
- Any number (zero or more) of query parameters statements, which declare query parameters used by the function. If specified, they must precede the expression defining the function’s return value.
Examples of user-defined functions
The following section shows examples of how to use user-defined functions.
User-defined function that uses a let statement
The following example shows a user-defined function (lambda) that accepts a parameter named id. The function is bound to the name Test and makes use of three let statements, in which the Test3 definition uses the id parameter. When run, the output from the query is 70:
let Test = (id: int) {
let Test2 = 10;
let Test3 = 10 + Test2 + id;
let Test4 = (arg: int) {
let Test5 = 20;
Test2 + Test3 + Test5 + arg
};
Test4(10)
};
range x from 1 to Test(10) step 1
| count
User-defined function that defines a default value for a parameter
The following example shows a function that accepts three arguments. The latter two have a default value and don’t have to be present at the call site.
let f = (a:long, b:string = "b.default", c:long = 0) {
strcat(a, "-", b, "-", c)
};
print f(12, c=7) // Returns "12-b.default-7"
Invoking a user-defined function
The method to invoke a user-defined function depends on the arguments that the function expects to receive. The following sections cover how to invoke a UDF without arguments, invoke a UDF with scalar arguments, and invoke a UDF with tabular arguments.
Invoke a UDF without arguments
A user-defined function that takes no arguments can be invoked either by its name, or by its name and an empty argument list in parentheses.
// Bind the identifier a to a user-defined function (lambda) that takes
// no arguments and returns a constant of type long:
let a=(){123};
// Invoke the function in two equivalent ways:
range x from 1 to 10 step 1
| extend y = x * a, z = x * a()
// Bind the identifier T to a user-defined function (lambda) that takes
// no arguments and returns a random two-by-two table:
let T=(){
range x from 1 to 2 step 1
| project x1 = rand(), x2 = rand()
};
// Invoke the function in two equivalent ways:
// (Note that the second invocation must be itself wrapped in
// an additional set of parentheses, as the union operator
// differentiates between "plain" names and expressions)
union T, (T())
Invoke a UDF with scalar arguments
A user-defined function that takes one or more scalar arguments can be invoked by using the function name and a concrete argument list in parentheses:
let f=(a:string, b:string) {
strcat(a, " (la la la)", b)
};
print f("hello", "world")
Invoke a UDF with tabular arguments
A user-defined function that takes one or more table arguments (with any number of scalar arguments) can be invoked using the function name and a concrete argument list in parentheses:
let MyFilter = (T:(x:long), v:long) {
T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)
You can also use the invoke operator to call a user-defined function that takes one or more table arguments and returns a table. This approach is useful when the first concrete table argument to the function is the source of the invoke operator:
let append_to_column_a=(T:(a:string), what:string) {
T | extend a=strcat(a, " ", what)
};
datatable (a:string) ["sad", "really", "sad"]
| invoke append_to_column_a(":-)")
Default values
Functions may provide default values to some of their parameters under the following conditions:
- Default values may be provided for scalar parameters only.
- Default values are always literals (constants). They can’t be arbitrary calculations.
- Parameters with no default value always precede parameters that do have a default value.
- Callers must provide the value of all parameters with no default values arranged in the same order as the function declaration.
- Callers don’t need to provide the value for parameters with default values, but may do so.
- Callers may provide arguments in an order that doesn’t match the order of the parameters. If so, they must name their arguments.
The following example returns a table with two identical records. In the first invocation of f, the arguments are completely “scrambled”, so each one is explicitly given a name:
let f = (a:long, b:string = "b.default", c:long = 0) {
strcat(a, "-", b, "-", c)
};
union
(print x=f(c=7, a=12)), // "12-b.default-7"
(print x=f(12, c=7)) // "12-b.default-7"
Output
| x |
|---|
| 12-b.default-7 |
| 12-b.default-7 |
View functions
A user-defined function that takes no arguments and returns a tabular expression can be marked as a view. Marking a user-defined function as a view means that the function behaves like a table whenever a wildcard table name resolution is performed.
The following example shows two user-defined functions, T_view and T_notview, and shows how only the first one is resolved by the wildcard reference in the union:
let T_view = view () { print x=1 };
let T_notview = () { print x=2 };
union T*
Restrictions
The following restrictions apply:
- User-defined functions can’t pass into toscalar() invocation information that depends on the row-context in which the function is called.
- User-defined functions that return a tabular expression can’t be invoked with an argument that varies with the row context.
- A function taking at least one tabular input can’t be invoked on a remote cluster.
- A scalar function can’t be invoked on a remote cluster.
The only place a user-defined function may be invoked with an argument that varies with the row context is when the user-defined function is composed of scalar functions only and doesn’t use toscalar().
Examples
Supported scalar function
The following query is supported because f is a scalar function that doesn’t reference any tabular expression.
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { now() + hours*1h };
Table2 | where Column != 123 | project d = f(10)
The following query is supported because f is a scalar function that references the tabular expression Table1 but is invoked with no reference to the current row context f(10):
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(10)
Unsupported scalar function
The following query isn’t supported because f is a scalar function that references the tabular expression Table1, and is invoked with a reference to the current row context f(Column):
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(Column)
Unsupported tabular function
The following query isn’t supported because f is a tabular function that is invoked in a context that expects a scalar value.
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { range x from 1 to hours step 1 | summarize make_list(x) };
Table2 | where Column != 123 | project d = f(Column)
Features that are currently unsupported by user-defined functions
For completeness, here are some commonly requested features for user-defined functions that are currently not supported:
Function overloading: There’s currently no way to overload a function (a way to create multiple functions with the same name and different input schema).
Default values: The default value for a scalar parameter to a function must be a scalar literal (constant).
67 - wilcoxon_test_fl()
The function wilcoxon_test_fl() is a user-defined function (UDF) that performs the Wilcoxon signed-rank test.
Syntax
T | invoke wilcoxon_test_fl(data, test_statistic, p_value)
Parameters
| Name | Type | Required | Description |
|---|---|---|---|
| data | string | ✔️ | The name of the column containing the data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Wilcoxon Test")
wilcoxon_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')
Output
| id | sample1 | test_stat | p_val |
|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | 0 | 0.10880943004054568 |
| Test #2 | [20.85, 21.89, 23.41] | 0 | 0.10880943004054568 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | 0 | 0.06788915486182899 |