Functions
- 1: bartlett_test_fl()
- 2: binomial_test_fl()
- 3: comb_fl()
- 4: dbscan_dynamic_fl()
- 5: dbscan_fl()
- 6: detect_anomalous_new_entity_fl()
- 7: factorial_fl()
- 8: Functions
- 9: Functions library
- 10: geoip_fl()
- 11: get_packages_version_fl()
- 12: kmeans_dynamic_fl()
- 13: kmeans_fl()
- 14: ks_test_fl()
- 15: levene_test_fl()
- 16: log_reduce_fl()
- 17: log_reduce_full_fl()
- 18: log_reduce_predict_fl()
- 19: log_reduce_predict_full_fl()
- 20: log_reduce_train_fl()
- 21: mann_whitney_u_test_fl()
- 22: normality_test_fl()
- 23: pair_probabilities_fl()
- 24: pairwise_dist_fl()
- 25: percentiles_linear_fl()
- 26: perm_fl()
- 27: plotly_anomaly_fl()
- 28: plotly_gauge_fl()
- 29: plotly_scatter3d_fl()
- 30: predict_fl()
- 31: predict_onnx_fl()
- 32: quantize_fl()
- 33: series_clean_anomalies_fl()
- 34: series_cosine_similarity_fl()
- 35: series_dbl_exp_smoothing_fl()
- 36: series_dot_product_fl()
- 37: series_downsample_fl()
- 38: series_exp_smoothing_fl()
- 39: series_fbprophet_forecast_fl()
- 40: series_fit_lowess_fl()
- 41: series_fit_poly_fl()
- 42: series_lag_fl()
- 43: series_metric_fl()
- 44: series_monthly_decompose_anomalies_fl()
- 45: series_moving_avg_fl()
- 46: series_moving_var_fl()
- 47: series_mv_ee_anomalies_fl()
- 48: series_mv_if_anomalies_fl()
- 49: series_mv_oc_anomalies_fl()
- 50: series_rate_fl()
- 51: series_rolling_fl()
- 52: series_shapes_fl()
- 53: series_uv_anomalies_fl()
- 54: series_uv_change_points_fl()
- 55: time_weighted_avg_fl()
- 56: time_weighted_avg2_fl()
- 57: time_weighted_val_fl()
- 58: time_window_rolling_avg_fl()
- 59: two_sample_t_test_fl()
- 60: User-defined functions
- 61: wilcoxon_test_fl()
1 - bartlett_test_fl()
The bartlett_test_fl() function is a user-defined tabular function that performs the Bartlett test for equal variances.
Syntax
T | invoke bartlett_test_fl(data1, data2, test_statistic, p_value)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Bartlett Test")
bartlett_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.bartlett(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Example query that uses the function
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
id | sample1 | sample2 | test_stat | p_val |
---|---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.7660796224425723 | 0.183868001738637 |
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.9211710616896014 | 0.16572762069132516 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0026985713829234454 | 0.958570306268548 |
2 - binomial_test_fl()
The function binomial_test_fl() is a UDF (user-defined function) that performs the binomial test.
Syntax
T | invoke binomial_test_fl(successes, trials, p_value [, success_prob [, alt_hypotheis ]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
successes | string | ✔️ | The name of the column containing the number of success results. |
trials | string | ✔️ | The name of the column containing the total number of trials. |
p_value | string | ✔️ | The name of the column to store the results. |
success_prob | real | | The success probability. The default is 0.5. |
alt_hypotheis | string | | The alternate hypothesis can be two-sided, greater, or less. The default is two-sided. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Binomial test")
binomial_test_fl(tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
let code = ```if 1:
from scipy import stats
successes = kargs["successes"]
trials = kargs["trials"]
p_value = kargs["p_value"]
success_prob = kargs["success_prob"]
alt_hypotheis = kargs["alt_hypotheis"]
def func(row, prob, h1):
pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
return pv
result = df
result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')
Stored
datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')
Output
id | x | n | p_val |
---|---|---|---|
Test #1 | 3 | 5 | 0.05792 |
Test #2 | 5 | 5 | 0.00032 |
Test #3 | 3 | 15 | 0.601976790745087 |
3 - comb_fl()
Calculate C(n, k)
The function comb_fl() is a user-defined function (UDF) that calculates C(n, k), the number of combinations for selection of k items out of n, without order. It's based on the native gamma() function to calculate factorial. For more information, see factorial_fl(). For a selection of k items with order, use perm_fl().
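The calculation can be checked directly with the native gamma() function. The following standalone sketch (not part of the library code) reproduces C(9, 7) = 36 from the example output below:
print n = 9, k = 7
| extend cnk = tolong(gamma(n + 1) / gamma(k + 1) / gamma(n - k + 1))   // C(n, k) = n! / (k! * (n-k)!) = 36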
Syntax
comb_fl(n, k)
Parameters
Name | Type | Required | Description |
---|---|---|---|
n | int | ✔️ | The total number of items. |
k | int | ✔️ | The number of selected items. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let comb_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of combinations for selection of k items out of n items without order")
comb_fl(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let comb_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
let fact_k = gamma(k+1);
tolong(fact_n/fact_nk/fact_k)
};
range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)
Stored
range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)
Output
n | k | cnk |
---|---|---|
3 | 1 | 3 |
6 | 4 | 15 |
9 | 7 | 36 |
4 - dbscan_dynamic_fl()
The function dbscan_dynamic_fl() is a UDF (user-defined function) that clusterizes a dataset using the DBSCAN algorithm. This function is similar to dbscan_fl(), except that the features are supplied by a single numeric array column rather than by multiple scalar columns.
Syntax
T | invoke dbscan_dynamic_fl(features_col, cluster_col, epsilon, min_samples, metric, metric_params)
Parameters
Name | Type | Required | Description |
---|---|---|---|
features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering. |
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
metric | string | | The metric to use when calculating distance between points. |
metric_params | dynamic | | Extra keyword arguments for the metric function. |
- For a detailed description of the parameters, see the DBSCAN documentation
- For the list of metrics, see distance computations
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering of features passed as a single column containing numerical array")
dbscan_dynamic_fl(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features_col].apply(np.array)
mat = np.vstack(df1.values)
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
5 - dbscan_fl()
The function dbscan_fl() is a UDF (user-defined function) that clusterizes a dataset using the DBSCAN algorithm.
Syntax
T | invoke dbscan_fl(features, cluster_col, epsilon, min_samples, metric, metric_params)
Parameters
Name | Type | Required | Description |
---|---|---|---|
features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering. |
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
metric | string | | The metric to use when calculating distance between points. |
metric_params | dynamic | | Extra keyword arguments for the metric function. |
- For a detailed description of the parameters, see the DBSCAN documentation
- For the list of metrics, see distance computations
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering")
dbscan_fl(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
'metric', metric, 'metric_params', metric_params);
let code = ```if 1:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
features = kargs["features"]
cluster_col = kargs["cluster_col"]
epsilon = kargs["epsilon"]
min_samples = kargs["min_samples"]
metric = kargs["metric"]
metric_params = kargs["metric_params"]
df1 = df[features]
mat = df1.values
# Scale the dataframe
scaler = StandardScaler()
mat = scaler.fit_transform(mat)
# see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
labels = dbscan.fit_predict(mat)
result = df
result[cluster_col] = labels
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)
6 - detect_anomalous_new_entity_fl()
Detect the appearance of anomalous new entities in timestamped data.
The function detect_anomalous_new_entity_fl() is a UDF (user-defined function) that detects the appearance of anomalous new entities - such as IP addresses or users - in timestamped data, such as traffic logs. In a cybersecurity context, such events might be suspicious and indicate a potential attack or compromise.
The anomaly model is based on a Poisson distribution representing the number of new entities appearing per time bin (such as day) for each scope. The Poisson distribution parameter is estimated based on the rate of appearance of new entities in the training period, with an added decay factor reflecting the fact that recent appearances are more important than old ones. Thus, we calculate the probability of encountering a new entity in the defined detection period per scope - such as a subscription or an account. The model output is controlled by several optional parameters, such as a minimal threshold for anomaly, a decay rate parameter, and others.
The model's direct output is an anomaly score based on the inverse of the estimated probability of encountering a new entity. The score is monotonic in the range [0, 1], with 1 representing something anomalous. In addition to the anomaly score, there's a binary flag for detected anomaly (controlled by a minimal threshold parameter), and other explanatory fields.
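As a rough, standalone illustration of this scoring logic (hypothetical numbers, not part of the function's code): the decay-weighted count of new entities per training day serves as the Poisson rate, the probability of encountering any new entity is 1 - P(X=0) = 1 - exp(-rate), and the anomaly score is its complement.
print decayedNewEntityCount = 0.19, trainingDays = 60   // hypothetical values
| extend rate = decayedNewEntityCount / trainingDays                  // estimated Poisson parameter (new entities per day)
| extend newEntityProbability = round(1 - exp(-1.0 * rate), 4)        // probability of seeing any new entity
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)   // values near 1 are anomalous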
Syntax
detect_anomalous_new_entity_fl(entityColumnName, scopeColumnName, timeColumnName, startTraining, startDetection, endDetection, [maxEntitiesThresh], [minTrainingDaysThresh], [decayParam], [anomalyScoreThresh])
Parameters
Name | Type | Required | Description |
---|---|---|---|
entityColumnName | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which anomaly model is calculated. |
scopeColumnName | string | ✔️ | The name of the input table column containing the partition or scope, so that a different anomaly model is built for each scope. |
timeColumnName | string | ✔️ | The name of the input table column containing the timestamps, that are used to define the training and detection periods. |
startTraining | datetime | ✔️ | The beginning of the training period for the anomaly model. Its end is defined by the beginning of detection period. |
startDetection | datetime | ✔️ | The beginning of the detection period for anomaly detection. |
endDetection | datetime | ✔️ | The end of the detection period for anomaly detection. |
maxEntitiesThresh | int | | The maximum number of existing entities in scope to calculate anomalies. If the number of entities is above the threshold, the scope is considered too noisy and anomalies aren't calculated. The default value is 60. |
minTrainingDaysThresh | int | | The minimum number of days in the training period that a scope exists to calculate anomalies. If it is below the threshold, the scope is considered too new and unknown, so anomalies aren't calculated. The default value is 14. |
decayParam | real | | The decay rate parameter for the anomaly model, a number in range (0,1]. Lower values mean faster decay, so more importance is given to later appearances in the training period. A value of 1 means no decay, so a simple average is used for Poisson distribution parameter estimation. The default value is 0.95. |
anomalyScoreThresh | real | | The minimum value of the anomaly score for which an anomaly is detected, a number in range [0, 1]. Higher values mean that only more significant cases are considered anomalous, so fewer anomalies are detected (higher precision, lower recall). The default value is 0.9. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSet, firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
// adding exponentially decaying weights to counts
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| project-away scope1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (docstring = "Detect new and anomalous entity (such as username or IP) per scope (such as subscription or account)", skipvalidation = "true", folder = 'KCL')
detect_anomalous_new_entity_fl(T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSet, firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
    // adding exponentially decaying weights to counts
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| project-away scope1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
, timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
, maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day'; // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
T
| extend scope = column_ifexists(scopeColumnName, '')
| extend entity = column_ifexists(entityColumnName, '')
| extend sliceTime = todatetime(column_ifexists(timeColumnName, ''))
| where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
| extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
, sliceTime >= startDetection and sliceTime <= endDetection, 'detectSet'
, 'other')
| where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
processedData
| summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet)
by scope, entity
| extend firstSeenSet = dataSet
| project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
entityData
| summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
, firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0)
by scope
| extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
| where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
entityData
| join kind = inner (aggregatedCandidateScopeData) on scope
| where firstSeenSet == 'trainSet'
| summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
by scope, firstSeenSet, firstSeenEntity
| extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
// adding exponentially decaying weights to counts
| extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
| extend decayingValue = countAddedEntities * decayingWeight
| summarize newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
, countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
| extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
| extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
processedData
| where dataSet == 'detectSet'
| join kind = inner (modelData) on scope
| project-away scope1
| where isAnomalousNewEntity == 1
| summarize arg_min(sliceTime, *) by scope, entity
| extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ', slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
, ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
| join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
| project-away scope1
);
resultsData
};
// synthetic data generation
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'userName' //principalName for positive, deviceId for negative
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Stored
let detectPeriodStart = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart = datetime(2022-03-01 05:00);
let names = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames = array_length(names);
let testData = range t from 1 to 24*60 step 1
| extend timeSlice = trainPeriodStart + 1h * t
| extend countEvents = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
| extend userName = tostring(names[toint(rand(countNames))])
| extend deviceId = hash_md5(rand())
| extend accountName = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
| extend userName = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
| extend deviceId = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
| sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'userName'
, scopeColumnName = 'accountName'
, timeColumnName = 'timeSlice'
, startTraining = trainPeriodStart
, startDetection = detectPeriodStart
, endDetection = detectPeriodStart
)
Output
scope | entity | sliceTime | t | timeSlice | countEvents | userName | deviceId | accountName | dataSet | firstSeenSet | newEntityProbability | countKnownEntities | lastNewEntityTimestamp | slicesOnScope | newEntityAnomalyScore | isAnomalousNewEntity | anomalyType | anomalyExplainability | anomalyState |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
prodEnvironment | H4ck3r | 2022-04-30 05:00:00.0000000 | 1440 | 2022-04-30 05:00:00.0000000 | 1687 | H4ck3r | abcdefghijklmnoprtuvwxyz012345678 | prodEnvironment | detectSet | trainSet | 0.0031 | 4 | 2022-03-01 09:00:00.0000000 | 60 | 0.9969 | 1 | newEntity_userName | The userName H4ck3r wasn’t seen on accountName prodEnvironment during the last 60 days. Previously, four entities were seen, the last one of them appearing at 2022-03-01 09:00. | [“IT-support : 2022-03-01 07:00”, “Admin : 2022-03-01 08:00”, “Dev2 : 2022-03-01 09:00”, “Dev1 : 2022-03-01 14:00”] |
The output of running the function is the first-seen row in the test dataset for each entity per scope, filtered for new entities (meaning they didn't appear during the training period) that were tagged as anomalous (meaning that the entity anomaly score was above anomalyScoreThresh). Some other fields are added for clarity:
- dataSet: current dataset (is always detectSet).
- firstSeenSet: dataset in which the scope was first seen (should be 'trainSet').
- newEntityProbability: probability to see any new entity based on the Poisson model estimation.
- countKnownEntities: number of existing entities on the scope.
- lastNewEntityTimestamp: last time a new entity was seen before the anomalous one.
- slicesOnScope: count of slices per scope.
- newEntityAnomalyScore: anomaly score of the new entity in range [0, 1], higher values meaning more anomalous.
- isAnomalousNewEntity: binary flag for anomalous new entities.
- anomalyType: shows the type of anomaly (helpful when running several anomaly detection logics together).
- anomalyExplainability: textual wrapper for the generated anomaly and its explanation.
- anomalyState: bag of existing entities on the scope with their first seen times.
Running this function on user per account with default parameters detects a previously unseen and anomalous user ('H4ck3r') with a high anomaly score of 0.9969, meaning that its appearance is unexpected (due to the small number of existing users in the training period).
When we run the function with default parameters on deviceId as the entity, we won't see an anomaly, due to the large number of existing devices, which makes a new one expected. However, if we lower the parameter anomalyScoreThresh to 0.0001 and raise the parameter maxEntitiesThresh to 10000, we effectively decrease precision in favor of recall, and detect an anomaly (with a low anomaly score) on device 'abcdefghijklmnoprtuvwxyz012345678', as in the sketch below.
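A minimal sketch of such a run, reusing the let definitions (testData, trainPeriodStart, detectPeriodStart) from the example above; the exact output depends on the random synthetic data:
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName = 'deviceId'
    , scopeColumnName = 'accountName'
    , timeColumnName = 'timeSlice'
    , startTraining = trainPeriodStart
    , startDetection = detectPeriodStart
    , endDetection = detectPeriodStart
    , maxEntitiesThresh = 10000
    , anomalyScoreThresh = 0.0001
)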
The output shows the anomalous entities together with explanation fields in a standardized format. These fields are useful for investigating the anomaly and for running anomalous entity detection on several entities or running other algorithms together.
The suggested usage in a cybersecurity context is running the function on meaningful entities - such as usernames or IP addresses - per meaningful scopes - such as subscriptions or accounts. A detected anomalous new entity means that its appearance isn't expected on the scope, and might be suspicious.
7 - factorial_fl()
Calculate factorial.
The function factorial_fl() is a UDF (user-defined function) that calculates the factorial of a positive integer (n!). It's a simple wrapper of the native gamma() function.
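The following standalone sketch (not part of the library code) shows the identity the wrapper relies on, n! = gamma(n+1):
print n = 5
| extend fx = gamma(n + 1)   // 5! = 120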
Syntax
factorial_fl(
n)
Parameters
Name | Type | Required | Description |
---|---|---|---|
n | int | ✔️ | The input integer for which to calculate the factorial. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let factorial_fl=(n:int)
{
gamma(n+1)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function command. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate factorial")
factorial_fl(n:int)
{
gamma(n+1)
}
Example
Query-defined
let factorial_fl=(n:int)
{
gamma(n+1)
};
range x from 1 to 10 step 3
| extend fx = factorial_fl(x)
Stored
range x from 1 to 10 step 3
| extend fx = factorial_fl(x)
Output
x | fx |
---|---|
1 | 1 |
4 | 24 |
7 | 5040 |
10 | 3628799 |
8 - Functions
Functions are reusable queries or query parts. Kusto supports two kinds of functions:
Built-in functions are hard-coded functions defined by Kusto that can’t be modified by users.
User-defined functions, which are divided into two types:
Stored functions: user-defined functions that are stored and managed as database schema entities, similar to tables. For more information, see Stored functions. To create a stored function, use the .create function command.
Query-defined functions: user-defined functions that are defined and used within the scope of a single query. The definition of such functions is done through a let statement. For more information on how to create query-defined functions, see Create a user defined function.
For more information on user-defined functions, see User-defined functions.
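For example, the following minimal sketch (with a hypothetical function name) defines a query-defined function in a let statement and uses it in the same query:
let double_fl = (x: long) { 2 * x };
range n from 1 to 3 step 1
| extend doubled = double_fl(n)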
9 - Functions library
The following article contains a categorized list of user-defined functions (UDFs).
The user-defined function code is given in the articles. It can be used within a let statement embedded in a query, or it can be persisted in a database using .create function.
Cybersecurity functions
Function Name | Description |
---|---|
detect_anomalous_new_entity_fl() | Detect the appearance of anomalous new entities in timestamped data. |
detect_anomalous_spike_fl() | Detect the appearance of anomalous spikes in numeric variables in timestamped data. |
graph_blast_radius_fl() | Calculate the Blast Radius (list and score) of source nodes over path or edge data. |
graph_exposure_perimeter_fl() | Calculate the Exposure Perimeter (list and score) of target nodes over path or edge data. |
graph_path_discovery_fl() | Discover valid paths between relevant endpoints (sources and targets) over graph data (edge and nodes). |
General functions
Function Name | Description |
---|---|
geoip_fl() | Retrieves geographic information of an IP address. |
get_packages_version_fl() | Returns version information of the Python engine and the specified packages. |
Machine learning functions
Function Name | Description |
---|---|
dbscan_fl() | Clusterize using the DBSCAN algorithm, features are in separate columns. |
dbscan_dynamic_fl() | Clusterize using the DBSCAN algorithm, features are in a single dynamic column. |
kmeans_fl() | Clusterize using the K-Means algorithm, features are in separate columns. |
kmeans_dynamic_fl() | Clusterize using the K-Means algorithm, features are in a single dynamic column. |
predict_fl() | Predict using an existing trained machine learning model. |
predict_onnx_fl() | Predict using an existing trained machine learning model in ONNX format. |
Plotly functions
The following section contains functions for rendering interactive Plotly charts.
Function Name | Description |
---|---|
plotly_anomaly_fl() | Render anomaly chart using a Plotly template. |
plotly_gauge_fl() | Render gauge chart using a Plotly template. |
plotly_scatter3d_fl() | Render 3D scatter chart using a Plotly template. |
PromQL functions
The following section contains common PromQL functions. These functions can be used for analysis of metrics ingested to your database by the Prometheus monitoring system. All functions assume that metrics in your database are structured using the Prometheus data model.
Function Name | Description |
---|---|
series_metric_fl() | Select and retrieve time series stored with the Prometheus data model. |
series_rate_fl() | Calculate the average rate of counter metric increase per second. |
Series processing functions
Function Name | Description |
---|---|
quantize_fl() | Quantize metric columns. |
series_clean_anomalies_fl() | Replace anomalies in a series by interpolated value. |
series_cosine_similarity_fl() | Calculate the cosine similarity of two numerical vectors. |
series_dbl_exp_smoothing_fl() | Apply a double exponential smoothing filter on series. |
series_dot_product_fl() | Calculate the dot product of two numerical vectors. |
series_downsample_fl() | Downsample time series by an integer factor. |
series_exp_smoothing_fl() | Apply a basic exponential smoothing filter on series. |
series_fit_lowess_fl() | Fit a local polynomial to series using LOWESS method. |
series_fit_poly_fl() | Fit a polynomial to series using regression analysis. |
series_fbprophet_forecast_fl() | Forecast time series values using the Prophet algorithm. |
series_lag_fl() | Apply a lag filter on series. |
series_monthly_decompose_anomalies_fl() | Detect anomalies in a series with monthly seasonality. |
series_moving_avg_fl() | Apply a moving average filter on series. |
series_moving_var_fl() | Apply a moving variance filter on series. |
series_mv_ee_anomalies_fl() | Multivariate Anomaly Detection for series using elliptical envelope model. |
series_mv_if_anomalies_fl() | Multivariate Anomaly Detection for series using isolation forest model. |
series_mv_oc_anomalies_fl() | Multivariate Anomaly Detection for series using one class SVM model. |
series_rolling_fl() | Apply a rolling aggregation function on series. |
series_shapes_fl() | Detects positive/negative trend or jump in series. |
series_uv_anomalies_fl() | Detect anomalies in time series using the Univariate Anomaly Detection Cognitive Service API. |
series_uv_change_points_fl() | Detect change points in time series using the Univariate Anomaly Detection Cognitive Service API. |
time_weighted_avg_fl() | Calculates the time weighted average of a metric using fill forward interpolation. |
time_weighted_avg2_fl() | Calculates the time weighted average of a metric using linear interpolation. |
time_weighted_val_fl() | Calculates the time weighted value of a metric using linear interpolation. |
time_window_rolling_avg_fl() | Calculates the rolling average of a metric over a constant duration time window. |
Statistical and probability functions
Function Name | Description |
---|---|
bartlett_test_fl() | Perform the Bartlett test. |
binomial_test_fl() | Perform the binomial test. |
comb_fl() | Calculate C(n, k), the number of combinations for selection of k items out of n. |
factorial_fl() | Calculate n!, the factorial of n. |
ks_test_fl() | Perform a Kolmogorov Smirnov test. |
levene_test_fl() | Perform a Levene test. |
normality_test_fl() | Performs the Normality Test. |
mann_whitney_u_test_fl() | Perform a Mann-Whitney U Test. |
pair_probabilities_fl() | Calculate various probabilities and related metrics for a pair of categorical variables. |
pairwise_dist_fl() | Calculate pairwise distances between entities based on multiple nominal and numerical variables. |
percentiles_linear_fl() | Calculate percentiles using linear interpolation between closest ranks |
perm_fl() | Calculate P(n, k), the number of permutations for selection of k items out of n. |
two_sample_t_test_fl() | Perform the two sample t-test. |
wilcoxon_test_fl() | Perform the Wilcoxon Test. |
Text analytics
Function Name | Description |
---|---|
log_reduce_fl() | Find common patterns in textual logs and output a summary table. |
log_reduce_full_fl() | Find common patterns in textual logs and output a full table. |
log_reduce_predict_fl() | Apply a trained model to find common patterns in textual logs and output a summary table. |
log_reduce_predict_full_fl() | Apply a trained model to find common patterns in textual logs and output a full table. |
log_reduce_train_fl() | Find common patterns in textual logs and output a model. |
10 - geoip_fl()
geoip_fl() is a user-defined function that retrieves geographic information of an IP address.
Syntax
T | invoke geoip_fl(ip_col, country_col, state_col, city_col, longitude_col, latitude_col)
Parameters
Name | Type | Required | Description |
---|---|---|---|
ip_col | string | ✔️ | The name of the column containing the IP addresses to resolve. |
country_col | string | ✔️ | The name of the column to store the retrieved country. |
state_col | string | ✔️ | The name of the column to store the retrieved state. |
city_col | string | ✔️ | The name of the column to store the retrieved city. |
longitude_col | string | ✔️ | The name of the column to store the retrieved longitude. |
latitude_col | string | ✔️ | The name of the column to store the retrieved latitude. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Utils', docstring = 'Retrieve geographics of ip address')
geoip_fl(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
let code= ```if 1:
from sandbox_utils import Zipackage
Zipackage.install('geoip2.zip')
import geoip2.database
ip_col = kargs['ip_col']
country_col = kargs['country_col']
state_col = kargs['state_col']
city_col = kargs['city_col']
longitude_col = kargs['longitude_col']
latitude_col = kargs['latitude_col']
result=df
reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')
def geodata(ip):
try:
gd = reader.city(ip)
geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
except:
geo = pd.Series((None, None, None, None, None))
return geo
result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
```;
tbl
| evaluate python(typeof(*), code, kwargs,
external_artifacts =
pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
)
};
datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')
Stored
datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')
Output
ip | country | state | city | longitude | latitude |
---|---|---|---|---|---|
20.103.85.33 | Netherlands | North Holland | Amsterdam | 4.8883 | 52.3716 |
20.53.203.50 | Australia | New South Wales | Sydney | 151.2006 | -33.8715 |
20.81.111.85 | United States | Virginia | Tappahannock | -76.8545 | 37.9273 |
20.84.181.62 | United States | Iowa | Des Moines | -93.6124 | 41.6021 |
205.251.242.103 | United States | Virginia | Ashburn | -77.4903 | 39.0469 |
8.8.8.8 | United States | California | Los Angeles | -118.2441 | 34.0544 |
11 - get_packages_version_fl()
get_packages_version_fl()
is a user-defined function that retrieves the versions of the Python engine and packages of the inline python() plugin.
The function accepts a dynamic array containing the names of the packages to check, and returns their respective versions and the Python engine version.
Syntax
T | invoke get_packages_version_fl(
packages)
Parameters
Name | Type | Required | Description |
---|---|---|---|
packages | dynamic | A dynamic array containing the names of the packages. Default is empty list to retrieve only the Python engine version. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Utils", docstring = "Returns version information of the Python engine and the specified packages")
get_packages_version_fl(packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
let kwargs = pack('packages', packages);
let code =
```if 1:
import importlib
import sys
packages = kargs["packages"]
result = pd.DataFrame(columns=["name", "ver"])
for i in range(len(packages)):
result.loc[i, "name"] = packages[i]
try:
m = importlib.import_module(packages[i])
result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
except Exception as ex:
result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
id = result.shape[0]
result.loc[id, "name"] = "Python"
result.loc[id, "ver"] = sys.version
```;
print 1
| evaluate python(typeof(name:string , ver:string), code, kwargs)
};
get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))
Stored
get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))
Output
name | ver |
---|---|
numpy | 1.23.4 |
onnxruntime | 1.13.1 |
pandas | 1.5.1 |
plotly | 5.11.0 |
Python | 3.10.8 (tags/v3.10.8:aaaf517, Oct 11 2022, 16:50:30) [MSC v.1933 64 bit (AMD64)] |
scipy | 1.9.3 |
sklearn | 1.1.3 |
statsmodels | 0.13.2 |
12 - kmeans_dynamic_fl()
The function kmeans_dynamic_fl()
is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm. This function is similar to kmeans_fl() just the features are supplied by a single numerical array column and not by multiple scalar columns.
Syntax
T | invoke kmeans_dynamic_fl(
k,
features_col,
cluster_col)
Parameters
Name | Type | Required | Description |
---|---|---|---|
k | int | ✔️ | The number of clusters. |
features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering. |
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "K-Means clustering of features passed as a single column containing numerical array")
kmeans_dynamic_fl(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clustering of artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features_col = kargs["features_col"]
cluster_col = kargs["cluster_col"]
df1 = df[features_col].apply(np.array)
matrix = np.vstack(df1.values)
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(matrix)
result = df
result[cluster_col] = kmeans.labels_
```;
tbl
| evaluate python(typeof(*),code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)
13 - kmeans_fl()
The function kmeans_fl()
is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm.
Syntax
T | invoke kmeans_fl(
k,
features,
cluster_col)
Parameters
Name | Type | Required | Description |
---|---|---|---|
k | int | ✔️ | The number of clusters. |
features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering. |
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "K-Means clustering")
kmeans_fl(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Clusterize artificial dataset with three clusters
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
let code = ```if 1:
from sklearn.cluster import KMeans
k = kargs["k"]
features = kargs["features"]
cluster_col = kargs["cluster_col"]
km = KMeans(n_clusters=k)
df1 = df[features]
km.fit(df1)
result = df
result[cluster_col] = km.labels_
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
OccupancyDetection
| extend cluster_id=int(null)
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)
Stored
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)
14 - ks_test_fl()
The function ks_test_fl()
is a UDF (user-defined function) that performs the Kolmogorov Smirnov Test.
Syntax
T | invoke ks_test_fl(
data1,
data2,
test_statistic,
p_value)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Kolmogorov Smirnov Test")
ks_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.ks_2samp(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
id | sample1 | sample2 | test_stat | p_val |
---|---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 0.66666666666666674 | 0.3197243332709643 |
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1 | 0.03262165165202116 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 1 | 0.01106563701580386 |
15 - levene_test_fl()
The function levene_test_fl()
is a UDF (user-defined function) that performs the Levene Test.
Syntax
T | invoke levene_test_fl(
data1,
data2,
test_statistic,
p_value)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
<!-- let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Levene Test")
levene_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
<!-- let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.levene(row[data1], row[data2])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
id | sample1 | sample2 | test_stat | p_val |
---|---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.5587395987367387 | 0.27993504690044563 |
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.6402495788130482 | 0.26950872948841353 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0032989690721642395 | 0.95606240301049072 |
16 - log_reduce_fl()
The function log_reduce_fl()
finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. It outputs a summary table containing the found patterns sorted top down by their respective frequency.
Syntax
T |
invoke
log_reduce_fl(
reduce_col [,
use_logram [,
use_drain [,
custom_regexes [,
custom_regexes_policy [,
delimiters [,
similarity_th [,
tree_depth [,
trigram_th [,
bigram_th ]]]]]]]]])
Parameters
The following parameters description is a summary. For more information, see More about the algorithm section.
Name | Type | Required | Description |
---|---|---|---|
reduce_col | string | ✔️ | The name of the string column the function is applied to. |
use_logram | bool | Enable or disable the Logram algorithm. Default value is true . | |
use_drain | bool | Enable or disable the Drain algorithm. Default value is true . | |
custom_regexes | dynamic | A dynamic array containing pairs of regular expression and replacement symbols to be searched in each input row, and replaced with their respective matching symbol. Default value is dynamic([]) . The default regex table replaces numbers, IP addresses, and GUIDs. | |
custom_regexes_policy | string | Either ‘prepend’, ‘append’ or ‘replace’. Controls whether custom_regexes are prepend/append/replace the default ones. Default value is ‘prepend’. | |
delimiters | dynamic | A dynamic array containing delimiter strings. Default value is dynamic([" "]) , defining space as the only single character delimiter. | |
similarity_th | real | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined databases. Default value is 0.5. If Drain is disabled, then this parameter has no effect. | |
tree_depth | int | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, then this parameter has no effect. | |
trigram_th | int | Decreasing trigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 10. If Logram is disabled, then this parameter has no effect. | |
bigram_th | int | Decreasing bigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 15. If Logram is disabled, then this parameter has no effect. |
More about the algorithm
The function runs multiples passes over the rows to be reduced to common patterns. The following list explains the passes:
Regular expression replacements: In this pass, each line is independently matched to a set of regular expressions, and each matched expression is replaced by a replacement symbol. The default regular expressions replace IP addresses, numbers, and GUIDs with /<IP>, <GUID> and /<NUM>. The user can prepend/append more regular expressions to those, or replace it with new ones or empty list by modifying custom_regexes and custom_regexes_policy. For example, to replace whole numbers with <WNUM> set custom_regexes=pack_array(’/^\d+$/’, ‘<WNUM>’); to cancel regular expressions replacement set custom_regexes_policy=‘replace’. For each line, the function keeps list of the original expressions (before replacements) to be output as parameters of the generic replacement tokens.
Tokenization: similar to the previous step, each line is processed independently and broken into tokens based on set of delimiters. For example, to define breaking to tokens by either comma, period or semicolon set delimiters=pack_array(’,’, ‘.’, ‘;’).
Apply Logram algorithm: this pass is optional, pending use_logram is true. We recommend using Logram when large scale is required, and when parameters can appear in the first tokens of the log entry. OTOH, disable it when the log entries are short, as the algorithm tends to replace tokens with wildcards too often in such cases. The Logram algorithm considers 3-tuples and 2-tuples of tokens. If a 3-tuple of tokens is common in the log lines (it appears more than trigram_th times), then it’s likely that all three tokens are part of the pattern. If the 3-tuple is rare, then it’s likely that it contains a variable that should be replaced by a wildcard. For rare 3-tuples, we consider the frequency with which 2-tuples contained in the 3-tuple appear. If a 2-tuple is common (it appears more than bigram_th times), then the remaining token is likely to be a parameter, and not part of the pattern.
The Logram algorithm is easy to parallelize. It requires two passes on the log corpus: the first one to count the frequency of each 3-tuple and 2-tuple, and the second one to apply the logic previously described to each entry. To parallelize the algorithm, we only need to partition the log entries, and unify the frequency counts of different workers.Apply Drain algorithm: this pass is optional, pending use_drain is true. Drain is a log parsing algorithm based on a truncated depth prefix tree. Log messages are split according to their length, and for each length the first tree_depth tokens of the log message are used to build a prefix tree. If no match for the prefix tokens was found, a new branch is created. If a match for the prefix was found, we search for the most similar pattern among the patterns contained in the tree leaf. Pattern similarity is measured by the ratio of matched nonwildcard tokens out of all tokens. If the similarity of the most similar pattern is above the similarity threshold (the parameter similarity_th), then the log entry is matched to the pattern. For that pattern, the function replaces all nonmatching tokens by wildcards. If the similarity of the most similar pattern is below the similarity threshold, a new pattern containing the log entry is created.
We set default tree_depth to 4 based on testing various logs. Increasing this depth can improve runtime but might degrade patterns accuracy; decreasing it’s more accurate but slower, as each node performs many more similarity tests.
Usually, Drain efficiently generalizes and reduces patterns (though it’s hard to be parallelized). However, as it relies on a prefix tree, it might not be optimal in log entries containing parameters in the first tokens. This can be resolved in most cases by applying Logram first.
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_fl=(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a summary table')
log_reduce_fl(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}
Example
The following example uses the invoke operator to run the function. This example uses Apache Hadoop distributed file system logs.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_fl=(tbl:(*), reduce_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")
Stored
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")
Output
Count | LogReduce | Example |
---|---|---|
55356 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_<NUM> is added to invalidSet of <IP> 081110 220623 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_1239016582509138045 is added to invalidSet of 10.251.123.195:50010 |
10278 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864 |
10256 | 081110 | <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating |
10256 | 081110 | <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 |
9140 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 |
3047 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/temporary/task<NUM><NUM>m<NUM>_<NUM>/part-<NUM>. <> 081110 215858 26 INFO dfs.FSNamesystem: BLOCK NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022 |
1402 | 081110 | <NUM> <NUM> INFO <>: <> block blk_<NUM> <> <> 081110 215957 15556 INFO dfs.DataNode$DataTransfer: 10.250.15.198:50010:Transmitted block blk_-3782569120714539446 to /10.251.203.129:50010 |
177 | 081110 | <NUM> <NUM> INFO <>: <> <> <> <*> 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474 |
36 | 081110 | <NUM> <NUM> INFO <>: <> <> <> for block <*> 081110 215924 15636 INFO dfs.DataNode$BlockReceiver: Receiving empty packet for block blk_3991288654265301939 |
12 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* <> <> <> <> <> <> <> <> 081110 215953 19 INFO dfs.FSNamesystem: BLOCK* ask 10.250.15.198:50010 to replicate blk_-3782569120714539446 to datanode(s) 10.251.203.129:50010 |
12 | 081110 | <NUM> <NUM> INFO <>: <> <> <> <> <> block blk_<NUM> <> <> 081110 215955 18 INFO dfs.DataNode: 10.250.15.198:50010 Starting thread to transfer block blk_-3782569120714539446 to 10.251.203.129:50010 |
12 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Received block blk_<NUM> src: <IP> dest: <IP> of size <NUM> 081110 215957 15226 INFO dfs.DataNode$DataXceiver: Received block blk_-3782569120714539446 src: /10.250.15.198:51013 dest: /10.250.15.198:50010 of size 14474705 |
6 | 081110 | <NUM> <NUM> <> dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: <> <> <> <> <> <> <> <> size <NUM> 081110 215924 27 WARN dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: Redundant addStoredBlock request received for blk_2522553781740514003 on 10.251.202.134:50010 size 67108864 |
6 | 081110 | <NUM> <NUM> INFO dfs.DataNode$DataXceiver: <> <> <> <> <>: <> <> <> <> <> 081110 215936 15714 INFO dfs.DataNode$DataXceiver: writeBlock blk_720939897861061328 received exception java.io.IOException: Couldn’t read from stream |
3 | 081110 | <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: <> <> <> <> <> <> <> size <NUM> <> <> <> <> <> <> <> <>. 081110 220635 28 INFO dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: addStoredBlock request received for blk_-81196479666306310 on 10.250.17.177:50010 size 53457811 But it doesn’t belong to any file. |
1 | 081110 | <NUM> <NUM> <> <>: <> <> <> <> <> <> <>. <> <> <> <> <>. 081110 220631 19 WARN dfs.FSDataset: Unexpected error trying to delete block blk_-2012154052725261337. BlockInfo not found in volumeMap. |
17 - log_reduce_full_fl()
The function log_reduce_full_fl()
finds common patterns in semi structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function’s algorithm and most of the parameters are identical to log_reduce_fl(). However, log_reduce_fl()
outputs a patterns summary table, whereas this function outputs a full table containing the pattern and parameters per each line.
Syntax
T |
invoke
log_reduce_full_fl(
reduce_col [,
pattern_col [,
parameters_col [,
use_logram [,
use_drain [,
custom_regexes [,
custom_regexes_policy [,
delimiters [,
similarity_th [,
tree_depth [,
trigram_th [,
bigram_th ]]]]]]]]]]])
Parameters
The following parameters description is a summary. For more information, see More about the algorithm section.
Name | Type | Required | Description |
---|---|---|---|
reduce_col | string | ✔️ | The name of the string column the function is applied to. |
pattern_col | string | ✔️ | The name of the string column to populate the pattern. |
parameters_col | string | ✔️ | The name of the string column to populate the pattern’s parameters. |
use_logram | bool | Enable or disable the Logram algorithm. Default value is true . | |
use_drain | bool | Enable or disable the Drain algorithm. Default value is true . | |
custom_regexes | dynamic | A dynamic array containing pairs of regular expression and replacement symbols to be searched in each input row, and replaced with their respective matching symbol. Default value is dynamic([]) . The default regex table replaces numbers, IPs and GUIDs. | |
custom_regexes_policy | string | Either ‘prepend’, ‘append’ or ‘replace’. Controls whether custom_regexes are prepend/append/replace the default ones. Default value is ‘prepend’. | |
delimiters | dynamic | A dynamic array containing delimiter strings. Default value is dynamic([" "]) , defining space as the only single character delimiter. | |
similarity_th | real | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, then this parameter has no effect. | |
tree_depth | int | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, then this parameter has no effect. | |
trigram_th | int | Decreasing trigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 10. If Logram is disabled, then this parameter has no effect. | |
bigram_th | int | Decreasing bigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 15. If Logram is disabled, then this parameter has no effect. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a full table')
log_reduce_full_fl(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col,
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10
Stored
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10
Output
data | Patterns | Parameters |
---|---|---|
081110 | 215858 | 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15485"”, ““parameter_2"”: ““5080254298708411681"”, ““parameter_3"”: ““67108864"”, ““parameter_4"”: “"/10.251.43.21"”}” |
081110 | 215858 | 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15494"”, ““parameter_2"”: “"-7037346755429293022"”, ““parameter_3"”: “"/10.251.43.21:45933"”, ““parameter_4"”: “"/10.251.43.21:50010"”}” |
081110 | 215858 | 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15496"”, ““parameter_2"”: ““2"”, ““parameter_3"”: “"-7746692545918257727"”}” |
081110 | 215858 | 15496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15496"”, ““parameter_2"”: “"-7746692545918257727"”, ““parameter_3"”: ““67108864"”, ““parameter_4"”: “"/10.251.107.227"”}” |
081110 | 215858 | 15511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15511"”, ““parameter_2"”: “"-8578644687709935034"”, ““parameter_3"”: “"/10.251.107.227:39600"”, ““parameter_4"”: “"/10.251.107.227:50010"”}” |
081110 | 215858 | 15514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15514"”, ““parameter_2"”: ““722881101738646364"”, ““parameter_3"”: “"/10.251.75.79:58213"”, ““parameter_4"”: “"/10.251.75.79:50010"”}” |
081110 | 215858 | 15517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15517"”, ““parameter_2"”: ““2"”, ““parameter_3"”: “"-7110736255599716271"”}” |
081110 | 215858 | 15517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15517"”, ““parameter_2"”: “"-7110736255599716271"”, ““parameter_3"”: ““67108864"”, ““parameter_4"”: “"/10.251.42.246"”}” |
081110 | 215858 | 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15533"”, ““parameter_2"”: ““7257432994295824826"”, ““parameter_3"”: “"/10.251.26.8:41803"”, ““parameter_4"”: “"/10.251.26.8:50010"”}” |
081110 | 215858 | 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> “{““parameter_0"”: ““215858"”, ““parameter_1"”: ““15533"”, ““parameter_2"”: “"-7771332301119265281"”, ““parameter_3"”: “"/10.251.43.210:34258"”, ““parameter_4"”: “"/10.251.43.210:50010"”}” |
18 - log_reduce_predict_fl()
The function log_reduce_predict_fl()
parses semi structured textual columns, such as log lines, and for each line it matches the respective pattern from a pretrained model or reports an anomaly if no matching pattern was found. The function’s’ output is similar to log_reduce_fl(), though the patterns are retrieved from a pretrained model that generated by log_reduce_train_fl().
Syntax
T |
invoke
log_reduce_predict_fl(
models_tbl,
model_name,
reduce_col [,
anomaly_str ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
models_tbl | table | ✔️ | A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string). |
model_name | string | ✔️ | The name of the model that will be retrieved from models_tbl. If the table contains few models matching the model name, the latest one is used. |
reduce_col | string | ✔️ | The name of the string column the function is applied to. |
anomaly_str | string | This string is output for lines that have no matched pattern in the model. Default value is “ANOMALY”. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a summary table')
log_reduce_predict_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '',
'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")
Stored
HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")
Output
Count | LogReduce | example |
---|---|---|
239 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 |
231 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 |
230 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating |
218 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> | 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864 |
79 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: <>. <> | 081110 215858 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022 |
3 | 081110 <NUM> <NUM> INFO dfs.DataBlockScanner: Verification succeeded for <*> | 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474 |
19 - log_reduce_predict_full_fl()
The function log_reduce_predict_full_fl()
parses semi-structured textual columns, such as log lines, and for each line it matches the respective pattern from a pretrained model or reports an anomaly if no matching pattern was found. The patterns are retrieved from a pretrained model generated by log_reduce_train_fl(). The function is similar to log_reduce_predict_fl(), but unlike log_reduce_predict_fl(), which outputs a patterns summary table, this function outputs a full table containing the pattern and parameters for each line.
Syntax
T | invoke log_reduce_predict_full_fl(
models_tbl,
model_name,
reduce_col,
pattern_col,
parameters_col [,
anomaly_str ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
models_tbl | table | ✔️ | A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string). |
model_name | string | ✔️ | The name of the model that will be retrieved from models_tbl. If the table contains multiple models matching the model name, the latest one is used. |
reduce_col | string | ✔️ | The name of the string column the function is applied to. |
pattern_col | string | ✔️ | The name of the string column to populate the pattern. |
parameters_col | string | ✔️ | The name of the string column to populate the pattern’s parameters. |
anomaly_str | string | | This string is output for lines that have no matched pattern in the model. Default value is "ANOMALY". |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a full table')
log_reduce_predict_full_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string),
model_name:string, reduce_col:string, pattern_col:string, parameters_col:string,
anomaly_str: string = 'ANOMALY')
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col',
parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
let code = ```if 1:
from log_cluster import log_reduce_predict
result = log_reduce_predict.log_reduce_predict(df, kargs)
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
Stored
HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
Output
data | Patterns | Parameters |
---|---|---|
081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15485", "parameter_2": "5080254298708411681", "parameter_3": "67108864", "parameter_4": "/10.251.43.21"} |
081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15494", "parameter_2": "-7037346755429293022", "parameter_3": "/10.251.43.21:45933", "parameter_4": "/10.251.43.21:50010"} |
081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "2", "parameter_3": "-7746692545918257727"} |
081110 215858 15496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "-7746692545918257727", "parameter_3": "67108864", "parameter_4": "/10.251.107.227"} |
081110 215858 15511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15511", "parameter_2": "-8578644687709935034", "parameter_3": "/10.251.107.227:39600", "parameter_4": "/10.251.107.227:50010"} |
081110 215858 15514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15514", "parameter_2": "722881101738646364", "parameter_3": "/10.251.75.79:58213", "parameter_4": "/10.251.75.79:50010"} |
081110 215858 15517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "2", "parameter_3": "-7110736255599716271"} |
081110 215858 15517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "-7110736255599716271", "parameter_3": "67108864", "parameter_4": "/10.251.42.246"} |
081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "7257432994295824826", "parameter_3": "/10.251.26.8:41803", "parameter_4": "/10.251.26.8:50010"} |
081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "-7771332301119265281", "parameter_3": "/10.251.43.210:34258", "parameter_4": "/10.251.43.210:50010"} |
20 - log_reduce_train_fl()
The function log_reduce_train_fl()
finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function’s algorithm and most of the parameters are identical to log_reduce_fl(), but unlike log_reduce_fl(), which outputs a patterns summary table, this function outputs a serialized model. The model can then be used by log_reduce_predict_fl() or log_reduce_predict_full_fl() to predict the matched pattern for new log lines.
Syntax
T | invoke log_reduce_train_fl(
reduce_col,
model_name [,
use_logram [,
use_drain [,
custom_regexes [,
custom_regexes_policy [,
delimiters [,
similarity_th [,
tree_depth [,
trigram_th [,
bigram_th ]]]]]]]]])
Parameters
The following parameter descriptions are a summary. For more information, see the More about the algorithm section.
Name | Type | Required | Description |
---|---|---|---|
reduce_col | string | ✔️ | The name of the string column the function is applied to. |
model_name | string | ✔️ | The name of the output model. |
use_logram | bool | | Enable or disable the Logram algorithm. Default value is true. |
use_drain | bool | | Enable or disable the Drain algorithm. Default value is true. |
custom_regexes | dynamic | | A dynamic array containing pairs of regular expressions and replacement symbols to be searched for in each input row and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IPs, and GUIDs. See the sketch after this table for the expected format. |
custom_regexes_policy | string | | Either 'prepend', 'append', or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'. |
delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single-character delimiter. |
similarity_th | real | | The similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, this parameter has no effect. |
tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, this parameter has no effect. |
trigram_th | int | | Decreasing trigram_th increases the chance that Logram replaces tokens with wildcards. Default value is 10. If Logram is disabled, this parameter has no effect. |
bigram_th | int | | Decreasing bigram_th increases the chance that Logram replaces tokens with wildcards. Default value is 15. If Logram is disabled, this parameter has no effect. |
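None of the examples below pass custom_regexes, so the following is a minimal, hedged sketch of the expected format, inferred from the default_regex_table defined inside the function body (a flat dynamic array alternating a regular expression and its replacement symbol). The blk_ pattern and the <BLOCK> symbol are illustrative only, not part of the package:
// Hypothetical: also mask HDFS block IDs with a custom symbol while training.
// custom_regexes is a flat array of [regex1, symbol1, regex2, symbol2, ...],
// mirroring the layout of default_regex_table in the function definition.
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K_custom",
    custom_regexes=dynamic(['blk_(|-)[0-9]+', '<BLOCK>']), custom_regexes_policy='prepend')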
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a model')
log_reduce_train_fl(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>',
'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>',
'(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '',
'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table,
'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th,
'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn',
'output_type', 'model');
let code = ```if 1:
from log_cluster import log_reduce
result = log_reduce.log_reduce(df, kargs)
```;
tbl
| extend LogReduce=''
| evaluate python(typeof(model:string), code, kwargs)
| project name=model_name, timestamp=now(), model
};
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")
Stored
//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")
Output
ExtentId | OriginalSize | ExtentSize | CompressedSize | IndexSize | RowCount |
---|---|---|---|---|---|
3734a525-cc08-44b9-a992-72de97b32414 | 10383 | 11546 | 10834 | 712 | 1 |
21 - mann_whitney_u_test_fl()
The function mann_whitney_u_test_fl()
is a UDF (user-defined function) that performs the Mann-Whitney U Test.
Syntax
T | invoke mann_whitney_u_test_fl(
data1,
data2,
test_statistic,
p_value [,
use_continuity ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
use_continuity | bool | | Determines if a continuity correction (1/2) is applied. Default is true. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Mann-Whitney U Test")
mann_whitney_u_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
let code = ```if 1:
from scipy import stats
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
use_continuity = kargs["use_continuity"]
def func(row):
statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
id | sample1 | sample2 | test_stat | p_val |
---|---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1 | 0.095215131912761986 |
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 0 | 0.04042779918502612 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0 | 0.015191410988288745 |
22 - normality_test_fl()
The function normality_test_fl()
is a UDF (user-defined function) that performs the Normality Test.
Syntax
T | invoke normality_test_fl(
data,
test_statistic,
p_value)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data | string | ✔️ | The name of the column containing the data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Normality Test")
normality_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.normaltest(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')
Output
id | sample1 | test_stat | p_val |
---|---|---|---|
Test #1 | [23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57] | 7.4881873153941036 | 0.023657060728893706 |
Test #2 | [20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89] | 3.29982750330276 | 0.19206647332255408 |
Test #3 | [20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5] | 6.9868433851364324 | 0.030396685911910585 |
23 - pair_probabilities_fl()
Calculate various probabilities and related metrics for a pair of categorical variables.
The function pair_probabilities_fl()
is a UDF (user-defined function) that calculates the following probabilities and related metrics for a pair of categorical variables, A and B:
- P(A) is the probability of each value A=a
- P(B) is the probability of each value B=b
- P(A|B) is the conditional probability of A=a given B=b
- P(B|A) is the conditional probability of B=b given A=a
- P(A∪B) is the union probability (A=a or B=b)
- P(A∩B) is the intersection probability (A=a and B=b)
- The lift metric is calculated as P(A∩B)/(P(A)*P(B)). For more information, see lift metric. A short worked example follows this list.
- A lift near 1 means that the joint probability of two values is similar to what is expected if the two variables are independent.
- Lift >> 1 means that the values co-occur more often than expected under the independence assumption.
- Lift << 1 means that the values are less likely to co-occur than expected under the independence assumption.
- The Jaccard similarity coefficient is calculated as P(A∩B)/P(A∪B). For more information, see Jaccard similarity coefficient.
- A high Jaccard coefficient, close to 1, means that the values tend to occur together.
- A low Jaccard coefficient, close to 0, means that the values tend to stay apart.
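As a quick sanity check of these formulas, here’s a minimal sketch with made-up probabilities (the numbers are illustrative and not taken from the example below):
// Hypothetical probabilities for a single pair of values A=a, B=b.
print P_A=0.4, P_B=0.5, P_AB=0.3
| extend P_AUB = P_A + P_B - P_AB       // union probability = 0.6
| extend Lift_AB = P_AB/(P_A * P_B)     // 0.3/0.2 = 1.5, the values co-occur more often than expected
| extend Jaccard_AB = P_AB/P_AUB        // 0.3/0.6 = 0.5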
Syntax
pair_probabilities_fl(
A, B, Scope)
Parameters
Name | Type | Required | Description |
---|---|---|---|
A | scalar | ✔️ | The first categorical variable. |
B | scalar | ✔️ | The second categorical variable. |
Scope | scalar | ✔️ | The field that contains the scope, so that the probabilities for A and B are calculated independently for each scope value. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate probabilities and related metrics for a pair of categorical variables")
pair_probabilities_fl (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope // probability for each value of A
| join kind = leftouter (probB) on _B, _scope // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB // union probability
, P_AIB = P_AB/P_B // conditional probability of A on B
, P_BIA = P_AB/P_A // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B) // lift metric
, Jaccard_AB = P_AB/P_AUB // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
//
let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
'James', 'Mary', 'Modern',
'James', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Patricia', 'Modern',
'Robert', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Classic',
'Robert', 'Linda', 'Classic',
'James', 'Mary', 'Classic',
'James', 'Linda', 'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')
Stored
let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
'James', 'Mary', 'Modern',
'James', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Patricia', 'Modern',
'Robert', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'Robert', 'Linda', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Modern',
'Michael', 'Patricia', 'Modern',
'Michael', 'Patricia', 'Modern',
'James', 'Linda', 'Modern',
'Robert', 'Mary', 'Classic',
'Robert', 'Linda', 'Classic',
'James', 'Mary', 'Classic',
'James', 'Linda', 'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')
Output
Let’s look at a list of pairs of people dancing at two dance classes, supposedly at random, to find out if anything looks anomalous (meaning, not random). We’ll start by looking at each class by itself.
The Michael-Patricia pair has a lift metric of 2.375, which is significantly above 1. This value means that they’re seen together much more often than would be expected if the pairing were random. Their Jaccard coefficient is 0.75, which is close to 1. When the pair dances, they prefer to dance together.
A | B | scope | P_A | P_B | P_AB | P_AUB | P_AIB | P_BIA | Lift_AB | Jaccard_AB |
---|---|---|---|---|---|---|---|---|---|---|
Robert | Patricia | Modern | 0.31578 | 0.42105 | 0.05263 | 0.68421 | 0.12499 | 0.16666 | 0.39583 | 0.07692 |
Robert | Mary | Modern | 0.31578 | 0.26315 | 0.15789 | 0.42105 | 0.59999 | 0.49999 | 1.89999 | 0.37499 |
Robert | Linda | Modern | 0.31578 | 0.31578 | 0.10526 | 0.52631 | 0.33333 | 0.33333 | 1.05555 | 0.2 |
Michael | Patricia | Modern | 0.31578 | 0.42105 | 0.31578 | 0.42105 | 0.75 | 0.99999 | 2.375 | 0.75 |
James | Patricia | Modern | 0.36842 | 0.42105 | 0.05263 | 0.73684 | 0.12499 | 0.14285 | 0.33928 | 0.07142 |
James | Mary | Modern | 0.36842 | 0.26315 | 0.10526 | 0.52631 | 0.4 | 0.28571 | 1.08571 | 0.2 |
James | Linda | Modern | 0.36842 | 0.31578 | 0.21052 | 0.47368 | 0.66666 | 0.57142 | 1.80952 | 0.44444 |
Robert | Mary | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
Robert | Linda | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
James | Mary | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
James | Linda | Classic | 0.49999 | 0.49999 | 0.24999 | 0.75 | 0.49999 | 0.49999 | 0.99999 | 0.33333 |
24 - pairwise_dist_fl()
Calculate pairwise distances between entities based on multiple nominal and numerical variables.
The function pairwise_dist_fl()
is a UDF (user-defined function) that calculates the multivariate distance between data points belonging to the same partition, taking into account nominal and numerical variables.
- All string fields, besides entity and partition names, are considered nominal variables. The distance is equal to 1 if the values are different, and 0 if they’re the same.
- All numerical fields are considered numerical variables. They’re normalized by transforming to z-scores and the distance is calculated as the absolute value of the difference. The total multivariate distance between data points is calculated as the average of the distances between variables.
A distance close to zero means that the entities are similar, and a distance above 1 means they’re different. Similarly, an entity whose average distance is close to or above 1 differs from many of the other entities in the partition, which suggests a potential outlier.
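To make the distance definition concrete, here’s a minimal sketch for two hypothetical entities that differ in one nominal variable and one numerical variable. The mean (150) and standard deviation (30) used for the z-scores are made up for the illustration:
// Nominal distance is 1 when the values differ; numerical distance is the absolute
// difference of the z-scores; the multivariate distance is the average over the variables.
print accessory1='Hat', accessory2='Bag', height1=180.0, height2=150.0
| extend nominal_dist = todouble(accessory1 != accessory2)                      // 1.0
| extend numeric_dist = abs((height1 - 150.0)/30.0 - (height2 - 150.0)/30.0)    // |1.0 - 0.0| = 1.0
| extend total_dist = (nominal_dist + numeric_dist)/2.0                         // 1.0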
Syntax
pairwise_dist_fl(
entity, partition)
Parameters
Name | Type | Required | Description |
---|---|---|---|
entity | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which the distances will be calculated. |
partition | string | ✔️ | The name of the input table column containing the partition or scope, so that the distances are calculated for all pairs of entities under the same partition. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
    // Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
// by adding metrics to previous function and using here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate distances between pairs of entities based on multiple nominal and numerical variables")
pairwise_dist_fl (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
    // Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
// by adding metrics to previous function and using here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
let generic_dist = (value1:dynamic, value2:dynamic)
{
// Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
// can be extended to other data types or tweaked by adding weights or changing formulas.
iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
};
let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
let sum_data = (
// Calculates summary statistics to be used for normalization.
T
| project-reorder _entity
| project _partition, p = pack_array(*)
| mv-expand with_itemindex=idx p
| summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
| sort by _partition, idx asc
| summarize make_list(avg_p), make_list(stdev_p) by _partition
);
let normalized_data = (
    // Performs normalization on numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
// by adding metrics to previous function and using here.
T
| project _partition, p = pack_array(*)
| join kind = leftouter (sum_data) on _partition
| mv-apply p, list_avg_p, list_stdev_p on (
extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
| summarize a = make_list(normalized) by _partition
)
| project _partition, a
);
let dist_data = (
// Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
normalized_data
| join kind = inner (normalized_data) on _partition
| project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
| mv-apply a, a1 on
(
project d = generic_dist(pack_array(a), pack_array(a1))
| summarize d = make_list(d)
)
| extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features
| project-away d
| where entity != entity1
| sort by _partition asc, entity asc, dist asc
);
dist_data
};
//
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
'Andy', 'M', 160, 80, 4, 'Hat', 'Person',
'Betsy', 'F', 170, 70, 4, 'Bag', 'Person',
'Cindy', 'F', 130, 30, 4, 'Hat', 'Person',
'Dan', 'M', 190, 105, 4, 'Hat', 'Person',
'Elmie', 'M', 110, 30, 4, 'Toy', 'Person',
'Franny', 'F', 170, 65, 4, 'Bag', 'Person',
'Godzilla', '?', 260, 210, 5, 'Tail', 'Person',
'Hannie', 'F', 112, 28, 4, 'Toy', 'Person',
'Ivie', 'F', 105, 20, 4, 'Toy', 'Person',
'Johnnie', 'M', 107, 21, 4, 'Toy', 'Person',
'Kyle', 'M', 175, 76, 4, 'Hat', 'Person',
'Laura', 'F', 180, 70, 4, 'Bag', 'Person',
'Mary', 'F', 160, 60, 4, 'Bag', 'Person',
'Noah', 'M', 178, 90, 4, 'Hat', 'Person',
'Odelia', 'F', 186, 76, 4, 'Bag', 'Person',
'Paul', 'M', 158, 69, 4, 'Bag', 'Person',
'Qui', 'F', 168, 62, 4, 'Bag', 'Person',
'Ronnie', 'M', 108, 26, 4, 'Toy', 'Person',
'Sonic', 'F', 52, 20, 6, 'Tail', 'Pet',
'Tweety', 'F', 52, 20, 6, 'Tail', 'Pet' ,
'Ulfie', 'M', 39, 29, 4, 'Wings', 'Pet',
'Vinnie', 'F', 53, 22, 1, 'Tail', 'Pet',
'Waldo', 'F', 51, 21, 4, 'Tail', 'Pet',
'Xander', 'M', 50, 24, 4, 'Tail', 'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc
Stored
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
'Andy', 'M', 160, 80, 4, 'Hat', 'Person',
'Betsy', 'F', 170, 70, 4, 'Bag', 'Person',
'Cindy', 'F', 130, 30, 4, 'Hat', 'Person',
'Dan', 'M', 190, 105, 4, 'Hat', 'Person',
'Elmie', 'M', 110, 30, 4, 'Toy', 'Person',
'Franny', 'F', 170, 65, 4, 'Bag', 'Person',
'Godzilla', '?', 260, 210, 5, 'Tail', 'Person',
'Hannie', 'F', 112, 28, 4, 'Toy', 'Person',
'Ivie', 'F', 105, 20, 4, 'Toy', 'Person',
'Johnnie', 'M', 107, 21, 4, 'Toy', 'Person',
'Kyle', 'M', 175, 76, 4, 'Hat', 'Person',
'Laura', 'F', 180, 70, 4, 'Bag', 'Person',
'Mary', 'F', 160, 60, 4, 'Bag', 'Person',
'Noah', 'M', 178, 90, 4, 'Hat', 'Person',
'Odelia', 'F', 186, 76, 4, 'Bag', 'Person',
'Paul', 'M', 158, 69, 4, 'Bag', 'Person',
'Qui', 'F', 168, 62, 4, 'Bag', 'Person',
'Ronnie', 'M', 108, 26, 4, 'Toy', 'Person',
'Sonic', 'F', 52, 20, 6, 'Tail', 'Pet',
'Tweety', 'F', 52, 20, 6, 'Tail', 'Pet' ,
'Ulfie', 'M', 39, 29, 4, 'Wings', 'Pet',
'Vinnie', 'F', 53, 22, 1, 'Tail', 'Pet',
'Waldo', 'F', 51, 21, 4, 'Tail', 'Pet',
'Xander', 'M', 50, 24, 4, 'Tail', 'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc
Output
entity1 | Andy | Betsy | Cindy | Dan | Elmie | Franny | Godzilla | Hannie | … |
---|---|---|---|---|---|---|---|---|---|
Andy | 0.354 | 0.4125 | 0.1887 | 0.4843 | 0.3702 | 1.2087 | 0.6265 | … | |
Betsy | 0.354 | 0.416 | 0.4708 | 0.6307 | 0.0161 | 1.2051 | 0.4872 | … | |
Cindy | 0.4125 | 0.416 | 0.6012 | 0.3575 | 0.3998 | 1.4783 | 0.214 | … | |
Dan | 0.1887 | 0.4708 | 0.6012 | 0.673 | 0.487 | 1.0199 | 0.8152 | … | |
Elmie | 0.4843 | 0.6307 | 0.3575 | 0.673 | 0.6145 | 1.5502 | 0.1565 | … | |
Franny | 0.3702 | 0.0161 | 0.3998 | 0.487 | 0.6145 | 1.2213 | 0.471 | … | |
Godzilla | 1.2087 | 1.2051 | 1.4783 | 1.0199 | 1.5502 | 1.2213 | 1.5495 | … | |
Hannie | 0.6265 | 0.4872 | 0.214 | 0.8152 | 0.1565 | 0.471 | 1.5495 | … | |
… | … | … | … | … | … | … | … | … | … |
Looking at entities of two different types, we would like to calculate the distance between entities belonging to the same type, taking into account both nominal variables (such as gender or preferred accessory) and numerical variables (such as the number of limbs, height, and weight). The numerical variables are on different scales and must be centered and scaled, which is done automatically. The output is pairs of entities under the same partition with a calculated multivariate distance. It can be analyzed directly, visualized as a distance matrix or scatterplot, or used as input data for an outlier detection algorithm by calculating the mean distance per entity; entities with high values indicate global outliers. For example, when adding an optional visualization using a distance matrix, you get a table as shown in the sample. From the sample, you can see that:
- Some pairs of entities (Betsy and Franny) have a low distance value (close to 0) indicating they’re similar.
- Some pairs of entities (Godzilla and Elmie) have a high distance value (1 or above) indicating they’re different.
The output can further be used to calculate the average distance per entity. A high average distance might indicate global outliers. For example, we can see that, on average, Godzilla has a high distance from the others, indicating that it’s a probable global outlier.
25 - percentiles_linear_fl()
The function percentiles_linear_fl()
is a user-defined function (UDF) that calculates percentiles using linear interpolation between closest ranks, the same method used by Excel’s PERCENTILE.INC function. Kusto’s native percentile functions use the nearest-rank method. For large sets of values the difference between the two methods is insignificant, so we recommend using the native function for best performance. For further details on these and additional percentile calculation methods, see the percentile article on Wikipedia.
The function accepts a table containing the column to calculate on and an optional grouping key, together with a dynamic array of the required percentiles, and returns a column containing a dynamic array of the percentile values for each group.
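For intuition, the following is a minimal sketch of the interpolation step for a single percentile, using the same formula as the function body (index = pct/100*(n-1), then linear interpolation between the two closest ranks). The input [5, 7, 9] matches group 'A' in the example below, where the 25th percentile comes out as 6:
// 25th percentile of [5, 7, 9]: index = 0.25*(3-1) = 0.5, halfway between 5 and 7 -> 6
print _vals=dynamic([5, 7, 9]), pct=25.0
| extend n = array_length(_vals)
| extend index = pct/100.0*(n-1)
| extend low_index = tolong(floor(index, 1)), high_index = tolong(ceiling(index))
| extend pct_val = todouble(_vals[low_index]) + (index - low_index)*(todouble(_vals[high_index]) - todouble(_vals[low_index]))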
Syntax
T | invoke percentiles_linear_fl(
val_col,
pct_arr [,
aggr_col ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
val_col | string | ✔️ | The name of the column that contains the values with which to calculate the percentiles. |
pct_arr | dynamic | ✔️ | A numerical array containing the required percentiles. Each percentile should be in the range [0-100]. |
aggr_col | string | | The name of the column that contains the grouping key. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate linear interpolated percentiles (identical to Excel's PERCENTILE.INC)")
percentiles_linear_fl(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
tbl
| extend _vals = column_ifexists(val_col, 0.0)
| extend _key = column_ifexists(aggr_col, 'ALL')
| order by _key asc, _vals asc
| summarize _vals=make_list(_vals) by _key
| extend n = array_length(_vals)
| extend pct=pct_arr
| mv-apply pct to typeof(real) on (
extend index=pct/100.0*(n-1)
| extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
| extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
| extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
| summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
| project-away n
};
datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals
Stored
datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals
Output
name | x | pct_arr | pct_val |
---|---|---|---|
A | [5,7,9] | [0,25,50,75,100] | [5,6,7,8,9] |
B | [5,7,7,10] | [0,25,50,75,100] | [5,6.5,7,7.75,10] |
26 - perm_fl()
Calculate P(n, k)
The function perm_fl()
is a user-defined function (UDF) that calculates P(n, k), the number of permutations for selecting k items out of n, with order. It’s based on the native gamma() function to calculate factorial (see factorial_fl()). For selection of k items without order, use comb_fl().
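For example, P(6, 4) = 6!/(6-4)! = 720/2 = 360, which matches the example output below. A minimal sketch of the same calculation with the native gamma() function, as used in the function body:
// gamma(n+1) = n!, so P(n, k) = gamma(n+1)/gamma(n-k+1)
print n=6, k=4
| extend pnk = tolong(gamma(n+1)/gamma(n-k+1))   // 720/2 = 360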
Syntax
perm_fl(
n, k)
Parameters
Name | Type | Required | Description |
---|---|---|---|
n | int | ✔️ | The total number of items. |
k | int | ✔️ | The number of selected items. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let perm_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of permutations for selection of k items out of n items with order")
perm_fl(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let perm_fl=(n:int, k:int)
{
let fact_n = gamma(n+1);
let fact_nk = gamma(n-k+1);
tolong(fact_n/fact_nk)
}
;
range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)
Stored
range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)
Output
n | k | pnk |
---|---|---|
3 | 1 | 3 |
6 | 4 | 360 |
9 | 7 | 181440 |
27 - plotly_anomaly_fl()
The function plotly_anomaly_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive anomaly chart.
The function accepts a table containing the source and the baseline time series, lists of positive and negative anomalies with their respective sizes, and a chart labeling string. The function returns a single-cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer or Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘anomaly’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_anomaly_fl(
time_col,
val_col,
baseline_col,
time_high_col,
val_high_col,
size_high_col,
time_low_col,
val_low_col,
size_low_col,
chart_title,
series_name,
val_name)
Parameters
Name | Type | Required | Description |
---|---|---|---|
time_col | string | ✔️ | The name of the column containing the dynamic array of the time points of the original time series |
val_col | string | ✔️ | The name of the column containing the values of the original time series |
baseline_col | string | ✔️ | The name of the column containing the values of the baseline time series. Anomalies are usually detected by large value offset from the expected baseline value. |
time_high_col | string | ✔️ | The name of the column containing the time points of high (above the baseline) anomalies |
val_high_col | string | ✔️ | The name of the column containing the values of the high anomalies |
size_high_col | string | ✔️ | The name of the column containing the marker sizes of the high anomalies |
time_low_col | string | ✔️ | The name of the column containing the time points of low anomalies |
val_low_col | string | ✔️ | The name of the column containing the values of the low anomalies |
size_low_col | string | ✔️ | The name of the column containing the marker sizes of the low anomalies |
chart_title | string | | Chart title. The default is 'Anomaly chart'. |
series_name | string | | Time series name. The default is 'Metric'. |
val_name | string | | Value axis name. The default is 'Value'. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render anomaly chart using plotly template")
plotly_anomaly_fl(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
time_low_col:string, val_low_col:string, size_low_col:string,
chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
_high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
_low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
tbl_ex
| extend plotly = anomaly_chart
| extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
| extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
| extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
| extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
| extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
| extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
| extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
| project plotly
};
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime) on (
summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly
Stored
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime) on (
summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly
Output
The output is a Plotly JSON string that can be rendered using '| render plotly' or in an Azure Data Explorer or Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.
The following image shows a sample anomaly chart using the above function:
You can zoom in and hover over anomalies:
28 - plotly_gauge_fl()
The function plotly_gauge_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create a gauge chart.
The function accepts a few parameters to customize the gauge chart and returns a single-cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer or Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘gauge’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_gauge_fl(
value,
max_range,
mode,
chart_title,
font_color,
bar_color,
bar_bg_color,
tick_color,
tick_width)
Parameters
Name | Type | Required | Description |
---|---|---|---|
value | real | ✔️ | The number to be displayed. |
max_range | real | | The maximum range of the gauge. |
mode | string | | Specifies how the value is displayed on the graph. Default is 'gauge+number'. |
chart_title | string | | The chart title. The default is an empty title. |
font_color | string | | The chart's font color. Default is 'black'. |
bar_color | string | | The gauge's filled bar color. Default is 'green'. |
bar_bg_color | string | | The gauge's unfilled bar color. Default is 'lightgreen'. |
tick_color | string | | The gauge's tick color. Default is 'darkblue'. |
tick_width | int | | The gauge's tick width. Default is 1. |
Plotly gauge charts support many parameters; this function exposes only the ones listed above. For more information, see the indicator traces reference.
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
};
// Write your query to use your function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render gauge chart using plotly template")
plotly_gauge_fl(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
print plotly = gauge_chart
| extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
| extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
| extend plotly=replace_string(plotly, '$MODE$', mode)
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
| extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
| extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
| extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
| extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
| project plotly
};
plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly
Stored
plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly
Output
The output is a Plotly JSON string that can be rendered in an Azure Data Explorer or Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.
29 - plotly_scatter3d_fl()
The function plotly_scatter3d_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive 3D scatter chart.
The function accepts a table containing the records to be rendered, the names of the x, y, z, and aggregation columns, and the chart title string. The function returns a single-cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer or Real-Time dashboard tile. For more information, see Plotly (preview).
Prerequisite
Extract the required ‘scatter3d’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:
.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate
Syntax
T | invoke plotly_scatter3d_fl(
x_col,
y_col,
z_col,
aggr_col [,
chart_title ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
x_col | string | ✔️ | The name of the column for the X coordinate of the 3D plot. |
y_col | string | ✔️ | The name of the column for the Y coordinate of the 3D plot. |
z_col | string | ✔️ | The name of the column for the Z coordinate of the 3D plot. |
aggr_col | string | ✔️ | The name of the grouping column. Records in the same group are rendered in a distinct color. |
chart_title | string | | The chart title. The default is '3D Scatter chart'. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
};
// Write your query to use your function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render 3D scatter chart using plotly template")
plotly_scatter3d_fl(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
tbl_ex
| serialize
| summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
| summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
| extend plotly = scatter3d_chart
| extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
| extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
| extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
| extend plotly=replace_string(plotly, '$X_NAME$', x_col)
| extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
| extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
| extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
| extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
| extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
| extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
| extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
| extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
| extend plotly=replace_string(plotly, '$TITLE$', chart_title)
| project plotly
};
Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
| render plotly
Stored
Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
Output
The output is a Plotly JSON string that can be rendered in an Azure Data Explorer or Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.
You can rotate, zoom and hover over specific records:
30 - predict_fl()
The function predict_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model was built using Scikit-learn, serialized to string, and saved in a standard table.
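The following Python snippet is a minimal sketch, not part of the function itself, showing one way such a model could be trained and serialized offline before ingesting it into the models table. It assumes scikit-learn is installed locally; the feature values are hypothetical. predict_fl() reverses these steps using binascii.unhexlify() and pickle.loads().
import pickle
import binascii
import numpy as np
from sklearn.linear_model import LogisticRegression
# Hypothetical training data - replace with your own features and labels.
X = np.array([[22.1, 27.3, 450.0, 700.0, 0.0045],
              [20.5, 25.1,   0.0, 520.0, 0.0038]])
y = np.array([1, 0])
clf = LogisticRegression().fit(X, y)
# Hex-encode the pickled model so it can be stored in the 'model' string column of the models table.
smodel = binascii.hexlify(pickle.dumps(clf)).decode('ascii')
print(smodel[:64] + '...')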
Syntax
T | invoke predict_fl(
models_tbl,
model_name,
features_cols,
pred_col)
Parameters
Name | Type | Required | Description |
---|---|---|---|
models_tbl | string | ✔️ | The name of the table that contains all serialized models. The table must have the following columns: name (the model name), timestamp (time of model training), and model (string representation of the serialized model). |
model_name | string | ✔️ | The name of the specific model to use. |
features_cols | dynamic | ✔️ | An array containing the names of the feature columns that are used by the model for prediction. |
pred_col | string | ✔️ | The name of the column that stores the predictions. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
};
// Write your code to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "Predict using ML model, build by Scikit-learn")
predict_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import pickle
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
clf1 = pickle.loads(bmodel)
df1 = df[features_cols]
predictions = clf1.predict(df1)
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples
| evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Stored
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Output
Occupancy | pred_Occupancy | n |
---|---|---|
TRUE | TRUE | 3006 |
FALSE | TRUE | 112 |
TRUE | FALSE | 15 |
FALSE | FALSE | 9284 |
Model asset
Get the sample dataset and the pretrained model by running the following commands. The Python plugin must be enabled.
//dataset
.set OccupancyDetection <| cluster('help').database('Samples').OccupancyDetection
//model
.set ML_Models <| datatable(name:string, timestamp:datetime, model:string) [
'Occupancy', datetime(now), '800363736b6c6561726e2e6c696e6561725f6d6f64656c2e6c6f6769737469630a4c6f67697374696352656772657373696f6e0a7100298171017d710228580700000070656e616c7479710358020000006c32710458040000006475616c7105895803000000746f6c7106473f1a36e2eb1c432d5801000000437107473ff0000000000000580d0000006669745f696e746572636570747108885811000000696e746572636570745f7363616c696e6771094b01580c000000636c6173735f776569676874710a4e580c00000072616e646f6d5f7374617465710b4e5806000000736f6c766572710c58090000006c69626c696e656172710d58080000006d61785f69746572710e4b64580b0000006d756c74695f636c617373710f58030000006f767271105807000000766572626f736571114b00580a0000007761726d5f737461727471128958060000006e5f6a6f627371134b015808000000636c61737365735f7114636e756d70792e636f72652e6d756c746961727261790a5f7265636f6e7374727563740a7115636e756d70790a6e6461727261790a71164b00857117430162711887711952711a284b014b0285711b636e756d70790a64747970650a711c58020000006231711d4b004b0187711e52711f284b0358010000007c71204e4e4e4affffffff4affffffff4b007471216289430200017122747123625805000000636f65665f7124681568164b008571256818877126527127284b014b014b05867128681c5802000000663871294b004b0187712a52712b284b0358010000003c712c4e4e4e4affffffff4affffffff4b0074712d628943286a02e0d50687e0bfc6d7c974fa93a63fb3d3b8080e6e943ffceb15defdad713f14c3a76bd73202bf712e74712f62580a000000696e746572636570745f7130681568164b008571316818877132527133284b014b01857134682b894308f1e89f57711290bf71357471366258070000006e5f697465725f7137681568164b00857138681887713952713a284b014b0185713b681c58020000006934713c4b004b0187713d52713e284b03682c4e4e4e4affffffff4affffffff4b0074713f628943040c00000071407471416258100000005f736b6c6561726e5f76657273696f6e71425806000000302e31392e32714375622e'
]
31 - predict_onnx_fl()
The function predict_onnx_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model has been converted to ONNX format, serialized to string, and saved in a standard table.
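The following Python snippet is a minimal sketch, not part of the function itself, showing one way such a model could be converted and serialized offline before ingesting it into the models table. It assumes the skl2onnx package is installed locally; the feature values are hypothetical. predict_onnx_fl() decodes the string using binascii.unhexlify() and loads the model with onnxruntime.
import binascii
import numpy as np
from sklearn.linear_model import LogisticRegression
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Hypothetical training data - replace with your own features and labels.
X = np.array([[22.1, 27.3, 450.0, 700.0, 0.0045],
              [20.5, 25.1,   0.0, 520.0, 0.0038]])
y = np.array([1, 0])
clf = LogisticRegression().fit(X, y)
# Convert to ONNX, then hex-encode the serialized model for the 'model' string column of the models table.
onnx_model = convert_sklearn(clf, initial_types=[('float_input', FloatTensorType([None, X.shape[1]]))])
smodel = binascii.hexlify(onnx_model.SerializeToString()).decode('ascii')
print(smodel[:64] + '...')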
Syntax
T | invoke predict_onnx_fl(
models_tbl,
model_name,
features_cols,
pred_col)
Parameters
Name | Type | Required | Description |
---|---|---|---|
models_tbl | string | ✔️ | The name of the table that contains all serialized models. The table must have the following columns: name (the model name), timestamp (time of model training), and model (string representation of the serialized model). |
model_name | string | ✔️ | The name of the specific model to use. |
features_cols | dynamic | ✔️ | An array containing the names of the feature columns that are used by the model for prediction. |
pred_col | string | ✔️ | The name of the column that stores the predictions. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\ML", docstring = "Predict using ONNX model")
predict_onnx_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
let code = ```if 1:
import binascii
smodel = kargs["smodel"]
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
bmodel = binascii.unhexlify(smodel)
features_cols = kargs["features_cols"]
pred_col = kargs["pred_col"]
import onnxruntime as rt
sess = rt.InferenceSession(bmodel)
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
df1 = df[features_cols]
predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
result = df
result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
```;
samples | evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Stored
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy
Output
Occupancy | pred_Occupancy | n |
---|---|---|
TRUE | TRUE | 3006 |
FALSE | TRUE | 112 |
TRUE | FALSE | 15 |
FALSE | FALSE | 9284 |
32 - quantize_fl()
The function quantize_fl() is a user-defined function (UDF) that bins metric columns, quantizing them to categorical labels based on the K-Means algorithm. Each input column is clustered into num_bins one-dimensional clusters, and the cluster boundaries define the bin edges.
Syntax
T | invoke quantize_fl(
num_bins,
in_cols,
out_cols [,
labels ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
num_bins | int | ✔️ | The required number of bins. |
in_cols | dynamic | ✔️ | An array containing the names of the columns to quantize. |
out_cols | dynamic | ✔️ | An array containing the names of the respective output columns for the binned values. |
labels | dynamic | | An array containing the label names. If unspecified, bin ranges will be used. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create function with (folder = "Packages\\ML", docstring = "Binning metric columns")
quantize_fl(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic)
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
let code = ```if 1:
from sklearn.preprocessing import KBinsDiscretizer
num_bins = kargs["num_bins"]
in_cols = kargs["in_cols"]
out_cols = kargs["out_cols"]
labels = kargs["labels"]
result = df
binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
df_in = df[in_cols]
bdata = binner.fit_transform(df_in)
if labels is None:
for i in range(len(out_cols)): # loop on each column and convert it to binned labels
ii = np.round(binner.bin_edges_[i], 3)
labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
else:
result[out_cols] = np.take(labels, bdata.astype(int))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
union
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))
Stored
union
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))
Output
x | x_label | x_bin |
---|---|---|
1 | Low | 1.0-7.75 |
2 | Low | 1.0-7.75 |
3 | Low | 1.0-7.75 |
4 | Low | 1.0-7.75 |
5 | Low | 1.0-7.75 |
20 | High | 17.5-25.0 |
21 | High | 17.5-25.0 |
22 | High | 17.5-25.0 |
23 | High | 17.5-25.0 |
24 | High | 17.5-25.0 |
25 | High | 17.5-25.0 |
10 | Med | 7.75-17.5 |
11 | Med | 7.75-17.5 |
12 | Med | 7.75-17.5 |
13 | Med | 7.75-17.5 |
14 | Med | 7.75-17.5 |
15 | Med | 7.75-17.5 |
33 - series_clean_anomalies_fl()
Cleans anomalous points in a series.
The function series_clean_anomalies_fl() is a user-defined function (UDF) that takes a dynamic numerical array and a corresponding numerical array of anomalies as input, and replaces the anomalies in the input array with the interpolated value of their adjacent points. For example, given the values [10, 20, 100, 40] with anomalies [0, 0, 1, 0], the anomalous 100 is replaced by 30, the linear interpolation of its neighbors 20 and 40.
Syntax
series_clean_anomalies_fl(
y_series,
anomalies)
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | The input array of numeric values. |
anomalies | dynamic | ✔️ | The anomalies array containing either 0 for normal points or any other value for anomalous points. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Replace anomalies by interpolated value", skipvalidation = "true")
series_clean_anomalies_fl(y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series); // replace anomalies with null values
series_fill_linear(fnum)
}
;
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)
Stored
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)
Output
34 - series_cosine_similarity_fl()
Calculates the cosine similarity of two numerical vectors.
The function series_cosine_similarity_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their cosine similarity.
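The cosine similarity is the dot product of the two vectors divided by the product of their sizes (norms):
cosine_similarity(vec1, vec2) = dot_product(vec1, vec2) / (size(vec1) * size(vec2))
If the vector sizes are already known, pass them in vec1_size and vec2_size to avoid recalculating them.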
Syntax
series_cosine_similarity_fl(
vec1,
vec2,
[ vec1_size [,
vec2_size ]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
vec1 | dynamic | ✔️ | An array of numeric values. |
vec2 | dynamic | ✔️ | An array of numeric values that is the same length as vec1. |
vec1_size | real | | The size of vec1. This is equivalent to the square root of the dot product of the vector with itself. |
vec2_size | real | | The size of vec2. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the Cosine similarity of 2 numerical arrays")
series_cosine_similarity_fl(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
let dp = series_dot_product(vec1, vec2);
let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
dp/(v1l*v2l)
};
let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360
Stored
let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360
Output
angle |
---|
45 |
35 - series_dbl_exp_smoothing_fl()
Applies a double exponential smoothing filter on a series.
The function series_dbl_exp_smoothing_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a double exponential smoothing filter. When there is a trend in the series, this function is superior to the series_exp_smoothing_fl() function, which implements a basic exponential smoothing filter.
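For reference, double exponential smoothing is commonly written in Holt's formulation as:
s[i] = alpha*y[i] + (1-alpha)*(s[i-1] + b[i-1])
b[i] = beta*(s[i] - s[i-1]) + (1-beta)*b[i-1]
where s is the smoothed level and b is the trend estimate. The UDF implements the smoothing as a single IIR filter by calling series_iir() with coefficients derived from alpha and beta.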
Syntax
series_dbl_exp_smoothing_fl(
y_series [,
alpha [,
beta ]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array of numeric values. |
alpha | real | | A value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha. The default is 0.5. |
beta | real | | A value in the range [0-1] that specifies the weight of the last slope vs. the weight of the previous slopes, which is 1 - beta. The default is 0.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Double exponential smoothing for a series")
series_dbl_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4)
| render linechart
Stored
range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4)
| render linechart
Output
36 - series_dot_product_fl()
Calculates the dot product of two numerical vectors.
The function series_dot_product_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their dot product.
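The dot product is the sum of the element-wise products of the two vectors:
dot_product(vec1, vec2) = vec1[0]*vec2[0] + vec1[1]*vec2[1] + ... + vec1[n-1]*vec2[n-1]
The UDF calculates it by multiplying the arrays element-wise with series_multiply() and taking the last element of the cumulative sum produced by series_iir().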
Syntax
series_dot_product_fl(
vec1,
vec2)
Parameters
Name | Type | Required | Description |
---|---|---|---|
vec1 | dynamic | ✔️ | An array of numeric values. |
vec2 | dynamic | ✔️ | An array of numeric values that is the same length as vec1. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the dot product of 2 numerical arrays")
series_dot_product_fl(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
let elem_prod = series_multiply(vec1, vec2);
let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
todouble(cum_sum[-1])
};
union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)
Stored
union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)
Output
v1 | v2 | v3 |
---|---|---|
[1,2,3] | [4,5,6] | 32 |
[11,12,13] | [14,15,16] | 542 |
37 - series_downsample_fl()
The function series_downsample_fl() is a user-defined function (UDF) that downsamples a time series by an integer factor. This function takes a table containing multiple time series (dynamic numerical arrays), and downsamples each series. The output contains both the coarser series and its respective times array. To avoid aliasing, the function applies a simple low pass filter on each series before subsampling.
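The low pass filter is a centered moving average of length sampling_factor, implemented by series_fir() with a constant filter of ones (normalized and centered). For example, with a sampling_factor of 4, each filtered point is the average of 4 consecutive input points, and one of every 4 filtered samples is kept.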
Syntax
T | invoke series_downsample_fl(
t_col,
y_col,
ds_t_col,
ds_y_col,
sampling_factor)
Parameters
Name | Type | Required | Description |
---|---|---|---|
t_col | string | ✔️ | The name of the column that contains the time axis of the series to downsample. |
y_col | string | ✔️ | The name of the column that contains the series to downsample. |
ds_t_col | string | ✔️ | The name of the column to store the downsampled time axis of each series. |
ds_y_col | string | ✔️ | The name of the column to store the downsampled series. |
sampling_factor | int | ✔️ | An integer specifying the required downsampling factor. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Downsampling a series by an integer factor")
series_downsample_fl(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
tbl
| extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
| extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true) // apply a simple low pass filter before sub-sampling
| mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
(extend rid=row_number()-1
| where rid % sampling_factor == ceiling(sampling_factor/2.0)-1 // sub-sampling
| summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
| extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
| project-away _t_, _y_
| evaluate bag_unpack(cols)
};
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)
Stored
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)
Output
The time series downsampled by 4:
For reference, here is the original time series (before downsampling):
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| render timechart with(xcolumn=TimeStamp, ycolumns=num)
38 - series_exp_smoothing_fl()
Applies a basic exponential smoothing filter on a series.
The function series_exp_smoothing_fl()
is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a basic exponential smoothing filter.
Syntax
series_exp_smoothing_fl(
y_series [,
alpha ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array cell of numeric values. |
alpha | real | A value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha . The default is 0.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Basic exponential smoothing for a series")
series_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
}
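With coefficients b = [alpha] and a = [1, alpha-1], the IIR filter computes s[t] = alpha*y[t] + (1-alpha)*s[t-1], which is the basic exponential smoothing recursion. A short Python check (illustration only, assuming zero initial state as in the filter):
import numpy as np
from scipy.signal import lfilter

alpha = 0.4
y = np.random.rand(50)

# Direct recursion: each output mixes the current sample with the previous output.
s = np.zeros_like(y)
s[0] = alpha * y[0]
for t in range(1, len(y)):
    s[t] = alpha * y[t] + (1 - alpha) * s[t - 1]

iir = lfilter([alpha], [1, alpha - 1], y)   # the same coefficients the KQL function passes to series_iir()
print(np.allclose(s, iir))                  # True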
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4)
| render linechart
Stored
range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4)
| render linechart
Output
39 - series_fbprophet_forecast_fl()
The function series_fbprophet_forecast_fl()
is a user-defined function (UDF) that takes an expression containing a time series as input, and predicts the values of the last trailing points using the Prophet algorithm. The function returns both the forecasted points and their confidence intervals. This function is a Kusto Query Language (KQL) wrapper to the Prophet() class, and exposes only the parameters that are mandatory for prediction. Feel free to modify your copy to support more parameters, such as holidays, change points, Fourier order, and so on.
- Install the fbprophet package, since it isn’t included in the Python image. To install the package, do the following:
  - Follow the guidelines for Installing packages for the Python plugin.
  - To save time in the above guidelines, you can download the prophet ZIP file, containing the wheel files of prophet and its dependencies, from https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip. Save this file to your allowlisted blob container.
- Create a SAS token with read access to your ZIP file. To create a SAS token, see get the SAS for a blob container.
- In the Example, replace the URL reference in the external_artifacts parameter with your file path and its SAS token.
Syntax
T | invoke series_fbprophet_forecast_fl(
ts_series,
y_series,
y_pred_series,
[ points ],
[ y_pred_low_series ],
[ y_pred_high_series ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
ts_series | string | ✔️ | The name of the input table column containing the time stamps of the series to predict. |
y_series | string | ✔️ | The name of the input table column containing the values of the series to predict. |
y_pred_series | string | ✔️ | The name of the column to store the predicted series. |
points | int | ✔️ | The number of points at the end of the series to predict (forecast). These points are excluded from the learning (regression) process. The default is 0. |
y_pred_low_series | string | The name of the column to store the series of the lowest values of the confidence interval. Omit if the confidence interval isn’t needed. | |
y_pred_high_series | string | The name of the column to store the series of the highest values of the confidence interval. Omit if the confidence interval isn’t needed. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Forecast using Facebook fbprophet package")
series_fbprophet_forecast_fl(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
}
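For each input row, the embedded script builds a small dataframe with ds (timestamps) and y (values) columns, fits a Prophet model on all but the last points samples, and predicts over the full time range. A minimal standalone sketch of that per-series step, assuming the prophet package is installed locally (the toy series below is illustrative only):
import pandas as pd
from prophet import Prophet

# Toy 2-hour series; in the UDF, ds and y come from the ts_series and y_series columns.
ds = pd.date_range("2017-01-05", periods=240, freq="2H")
y = (pd.Series(range(240)) % 12).astype(float)
df1 = pd.DataFrame({"ds": ds, "y": y})

points = 24                       # trailing points to forecast
m = Prophet()
m.fit(df1[:-points])              # train on everything except the last `points` samples
forecast = m.predict(df1[["ds"]]) # predict over the full range, including the trailing points
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())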
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
let code = ```if 1:
from sandbox_utils import Zipackage
Zipackage.install("prophet.zip")
ts_series = kargs["ts_series"]
y_series = kargs["y_series"]
y_pred_series = kargs["y_pred_series"]
points = kargs["points"]
y_pred_low_series = kargs["y_pred_low_series"]
y_pred_high_series = kargs["y_pred_high_series"]
result = df
sr = pd.Series(df[y_pred_series])
if y_pred_low_series != '':
srl = pd.Series(df[y_pred_low_series])
if y_pred_high_series != '':
srh = pd.Series(df[y_pred_high_series])
from prophet import Prophet
df1 = pd.DataFrame(columns=["ds", "y"])
for i in range(df.shape[0]):
df1["ds"] = pd.to_datetime(df[ts_series][i])
df1["ds"] = df1["ds"].dt.tz_convert(None)
df1["y"] = df[y_series][i]
df2 = df1[:-points]
m = Prophet()
m.fit(df2)
future = df1[["ds"]]
forecast = m.predict(future)
sr[i] = list(forecast["yhat"])
if y_pred_low_series != '':
srl[i] = list(forecast["yhat_lower"])
if y_pred_high_series != '':
srh[i] = list(forecast["yhat_upper"])
result[y_pred_series] = sr
if y_pred_low_series != '':
result[y_pred_low_series] = srl
if y_pred_high_series != '':
result[y_pred_high_series] = srh
```;
tbl
| evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
//
// Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart
Stored
//
// Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart
Output
40 - series_fit_lowess_fl()
The function series_fit_lowess_fl()
is a user-defined function (UDF) that applies a LOWESS regression on a series. This function takes a table with multiple series (dynamic numerical arrays) and generates a LOWESS Curve, which is a smoothed version of the original series.
Syntax
T | invoke series_fit_lowess_fl(
y_series,
y_fit_series,
[ fit_size ],
[ x_series ],
[ x_istime ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | string | ✔️ | The name of the input table column containing the dependent variable. This column is the series to fit. |
y_fit_series | string | ✔️ | The name of the column to store the fitted series. |
fit_size | int | For each point, the local regression is applied on its respective fit_size closest points. The default is 5. | |
x_series | string | The name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series. | |
x_istime | bool | This boolean parameter is needed only if x_series is specified and it’s a vector of datetime. The default is false . |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Fits a local polynomial using LOWESS method to a series")
series_fit_lowess_fl(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
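Per series, the embedded script calls the LOWESS smoother from statsmodels with frac = fit_size / len(y), building a sequential x axis when no x column is supplied. A minimal standalone Python sketch of that call (illustration only, on synthetic data):
import numpy as np
import statsmodels.api as sm

y = np.sin(np.linspace(0, 4 * np.pi, 100)) + 0.3 * np.random.randn(100)
x = np.arange(len(y)) + 1               # the UDF builds this range when x_series is empty
fit_size = 9
# Local regression over roughly fit_size neighboring points for each sample.
z = sm.nonparametric.lowess(y, x, frac=fit_size / len(y), return_sorted=False)
print(z[:5])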
Examples
The following examples use the invoke operator to run the function.
LOWESS regression on regular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart
Stored
//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart
Output
Test irregular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart
Stored
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart
Output
Compare LOWESS versus polynomial fit
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_size = kargs["fit_size"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
import statsmodels.api as sm
def lowess_fit(ts_row, x_col, y_col, fsize):
y = ts_row[y_col]
fraction = fsize/len(y)
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
lowess = sm.nonparametric.lowess
z = lowess(y, x, return_sorted=False, frac=fraction)
return list(z)
result = df
result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart
Stored
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart
Output
41 - series_fit_poly_fl()
The function series_fit_poly_fl()
is a user-defined function (UDF) that applies a polynomial regression on a series. This function takes a table containing multiple series (dynamic numerical arrays) and generates the best fit high-order polynomial for each series using polynomial regression. This function returns both the polynomial coefficients and the interpolated polynomial over the range of the series.
Syntax
T | invoke series_fit_poly_fl(
y_series,
y_fit_series,
fit_coeff,
degree,
[ x_series ],
[ x_istime ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | string | ✔️ | The name of the input table column containing the dependent variable. That is, the series to fit. |
y_fit_series | string | ✔️ | The name of the column to store the best fit series. |
fit_coeff | string | ✔️ | The name of the column to store the best fit polynomial coefficients. |
degree | int | ✔️ | The required order of the polynomial to fit. For example, 1 for linear regression, 2 for quadratic regression, and so on. |
x_series | string | The name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series. | |
x_istime | bool | This parameter is needed only if x_series is specified and it’s a vector of datetime. The default is false. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Fit a polynomial of a specified degree to a series")
series_fit_poly_fl(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=false)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
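Per series, the embedded script fits the coefficients with np.polyfit and evaluates the fitted polynomial over the series range with np.poly1d. A standalone sketch of that step (illustration only, on synthetic data):
import numpy as np

x = np.linspace(-2.3, 2.7, 200)
y = x**5 - 8 * x**3 + 10 * x + 6 + np.random.randn(200)   # noisy fifth-order polynomial
degree = 5
coeff = np.polyfit(x, y, degree)    # best-fit coefficients, highest order first
y_fit = np.poly1d(coeff)(x)         # interpolated polynomial over the series range
print(np.round(coeff, 2))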
Examples
The following examples use the invoke operator to run the function.
Fit fifth order polynomial to a regular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)
Stored
//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)
Output
Test irregular time series
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)
Stored
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0 // delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)
Output
Fifth order polynomial with noise on x & y axes
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
let code = ```if 1:
y_series = kargs["y_series"]
y_fit_series = kargs["y_fit_series"]
fit_coeff = kargs["fit_coeff"]
degree = kargs["degree"]
x_series = kargs["x_series"]
x_istime = kargs["x_istime"]
def fit(ts_row, x_col, y_col, deg):
y = ts_row[y_col]
if x_col == "": # If there is no x column creates sequential range [1, len(y)]
x = np.arange(len(y)) + 1
else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
if x_istime:
x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
x = x - x.min()
x = x / x.max()
x = x * (len(x) - 1) + 1
else:
x = ts_row[x_col]
coeff = np.polyfit(x, y, deg)
p = np.poly1d(coeff)
z = p(x)
return z, coeff
result = df
if len(df):
result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
|fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart
Stored
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
|fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart
Output
42 - series_lag_fl()
Applies a lag on a series.
The function series_lag_fl()
is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and shifts it backward in time. It’s commonly used to shift a time series back so you can test whether a pattern is new or matches historical data.
Syntax
series_lag_fl(
y_series,
offset)
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array cell of numeric values. |
offset | int | ✔️ | An integer specifying the required offset in bins. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_lag_fl = (series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Shift a series by a specified offset")
series_lag_fl(series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
}
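The filter built above is a vector of zeros with a single trailing 1, so applying it with series_fir() delays the series by offset bins. A small NumPy sketch of the same idea (illustration only; the leading values are simply zero-padded here):
import numpy as np

def lag(series, offset):
    lag_filter = np.zeros(offset + 1)
    lag_filter[-1] = 1    # same filter the UDF builds with range/make_list: [0, ..., 0, 1]
    return np.convolve(series, lag_filter, mode="full")[: len(series)]

print(lag(np.array([10, 20, 30, 40, 50]), 2))   # [ 0  0 10 20 30]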
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_lag_fl = (series:dynamic, offset:int)
{
let lag_f = toscalar(range x from 1 to offset+1 step 1
| project y=iff(x == offset+1, 1, 0)
| summarize lag_filter = make_list(y));
series_fir(series, lag_f, false)
};
let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart
Stored
let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart
Output
43 - series_metric_fl()
The series_metric_fl()
function is a user-defined function (UDF) that selects and retrieves time series of metrics ingested to your database using the Prometheus monitoring system. This function assumes the data stored in your database is structured following the Prometheus data model. Specifically, each record contains:
- timestamp
- metric name
- metric value
- a variable set of labels ("key":"value" pairs)
Prometheus defines a time series by its metric name and a distinct set of labels. You can retrieve sets of time series using Prometheus Query Language (PromQL) by specifying the metric name and time series selector (a set of labels).
Syntax
T | invoke series_metric_fl(
timestamp_col,
name_col,
labels_col,
value_col,
metric_name,
labels_selector,
lookback,
offset)
Parameters
Name | Type | Required | Description |
---|---|---|---|
timestamp_col | string | ✔️ | The name of the column containing the timestamp. |
name_col | string | ✔️ | The name of the column containing the metric name. |
labels_col | string | ✔️ | The name of the column containing the labels dictionary. |
value_col | string | ✔️ | The name of the column containing the metric value. |
metric_name | string | ✔️ | The metric time series to retrieve. |
labels_selector | string | Time series selector string, similar to PromQL. It’s a string containing a list of "key":"value" pairs, for example '"key1":"val1","key2":"val2"' . The default is an empty string, which means no filtering. Note that regular expressions are not supported. | |
lookback | timespan | The range vector to retrieve, similar to PromQL. The default is 10 minutes. | |
offset | datetime | Offset back from current time to retrieve, similar to PromQL. Data is retrieved from ago(offset)-lookback to ago(offset). The default is 0, which means that data is retrieved up to now() . |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create function with (folder = "Packages\\Series", docstring = "Selecting & retrieving metrics like PromQL")
series_metric_fl(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
}
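For intuition, the selection logic amounts to filtering rows by metric name, time range, and required label substrings, then collecting one timestamps/values pair per distinct (name, labels) combination. A rough pandas sketch with hypothetical toy data (illustration only; column names follow the demo_prometheus example below):
import pandas as pd

df = pd.DataFrame({
    "TimeStamp": pd.to_datetime(["2020-12-07 23:55", "2020-12-07 23:56", "2020-12-07 23:55"]),
    "Name": ["writes", "writes", "reads"],
    "Labels": ['{"disk":"sda1","host":"h1"}', '{"disk":"sda1","host":"h1"}', '{"disk":"sda1","host":"h1"}'],
    "Val": [10.0, 12.0, 7.0],
})
selector = ['"disk":"sda1"']   # equivalent of labels_selector split on commas
mask = (df["Name"] == "writes") & df["Labels"].apply(lambda s: all(kv in s for kv in selector))
series = (df[mask].sort_values("TimeStamp")
          .groupby(["Name", "Labels"])
          .agg(timestamp=("TimeStamp", list), value=("Val", list)))
print(series)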
Examples
The following examples use the invoke operator to run the function.
With specifying selector
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)
Stored
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)
Output
Without specifying selector
The following example doesn’t specify a selector, so all ‘writes’ metrics are selected. This example assumes that the function is already installed, and uses the alternative direct calling syntax, specifying the input table as the first parameter:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
let etime = ago(offset);
let stime = etime - lookback;
metrics_tbl
| extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
| extend labels = dynamic_to_json(labels) // convert to string and sort by key
| where name == metric_name and timestamp between(stime..etime)
| order by timestamp asc
| summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
| where labels has_all (selector_d)
};
series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)
Stored
series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)
Output
44 - series_monthly_decompose_anomalies_fl()
Detect anomalous points in a daily series with monthly seasonality.
The function series_monthly_decompose_anomalies_fl()
is a user-defined function (UDF) that detects anomalies in multiple time series that have monthly seasonality. The function is built on top of series_decompose_anomalies(). The challenge is that the length of a month varies between 28 and 31 days, so building a baseline with series_decompose_anomalies() out of the box can capture only fixed seasonality, and therefore fails to match spikes or other patterns that occur on the first (or any other specific) day of each month.
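To work around the variable month length, the function maps each date to a "virtual day of year" in which every month occupies exactly 31 slots, fills the days that don't exist (or have no data) with the per-key, per-day-of-month median, and then passes a fixed period of 31 to series_decompose_anomalies(). A small Python sketch of the virtual-day mapping (illustration only):
import pandas as pd

# Every month is stretched to 31 slots, so day d of month m always lands on slot 31*(m-1)+d.
dates = pd.to_datetime(["2023-01-01", "2023-02-01", "2023-02-28", "2023-03-01"])
vdoy = 31 * (dates.month - 1) + dates.day
print(list(vdoy))   # [1, 32, 59, 63] - the 1st of each month is always 31 slots apart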
Syntax
T | invoke series_monthly_decompose_anomalies_fl(
[ threshold ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
threshold | real | Anomaly threshold. Default is 1.5. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for daily time series with monthly seasonality")
series_monthly_decompose_anomalies_fl(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
}
Example
The input table must contain _key, _date, and _val columns. The query builds a set of time series of _val for each _key, and adds anomalies, score, and baseline columns.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
let _tbl=materialize(tbl
| extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
| extend _vdoy = 31*(_moy-1)+_dom // virtual day of year (assuming all months have 31 days)
);
let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
let keys = _tbl | summarize by _key | extend dummy=1;
let years = _tbl | summarize by _year | extend dummy=1;
let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
| join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
vdoys
| join kind=leftouter _tbl on _key, _year, _vdoy
| project-away _key1, _year1, _moy1, _vdoy1
| extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
| partition by _key (as T
| where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and
_vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
)
| join kind=inner median_tbl on _key, $left._vdom == $right._dom
| extend _vval = coalesce(_val, p50)
//| order by _key asc, _vadoy asc // for debugging
| make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
| extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
| mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
| project-away _vadoy
| project-rename _val=_vval
| where isnotnull(_date)
};
demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)
Stored
demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)
Output
Series A with monthly anomalies:
Series B with monthly anomalies:
45 - series_moving_avg_fl()
Applies a moving average filter on a series.
The function series_moving_avg_fl()
is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies on it a simple moving average filter.
Syntax
series_moving_avg_fl(
y_series,
n [,
center ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array cell of numeric values. |
n | int | ✔️ | The width of the moving average filter. |
center | bool | Indicates whether the moving average is either applied symmetrically on a window before and after the current point or applied on a window from the current point backwards. By default, center is false . |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving average of specified width")
series_moving_avg_fl(y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
}
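series_fir() with a kernel of n ones and normalize set to true averages each point over a window of width n, either centered around the point or trailing behind it. A minimal NumPy sketch of the centered case (illustration only; edge handling differs slightly from the KQL normalization):
import numpy as np

def moving_avg(y, n):
    return np.convolve(y, np.ones(n) / n, mode="same")   # centered window of width n

y = np.array([1.0, 2.0, 6.0, 2.0, 1.0, 1.0, 1.0])
print(moving_avg(y, 3))   # the spike at index 2 is spread over its neighbors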
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
series_fir(y_series, repeat(1, n), true, center)
};
//
// Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, True)
| render timechart
Stored
//
// Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, True)
| render timechart
Output
46 - series_moving_var_fl()
Applies a moving variance filter on a series.
The function series_moving_var_fl()
is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies on it a moving variance filter.
Syntax
series_moving_var_fl(
y_series,
n [,
center ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array cell of numeric values. |
n | int | ✔️ | The width of the moving variance filter. |
center | bool | | Indicates whether the moving variance is either applied symmetrically on a window before and after the current point or applied on a window from the current point backwards. By default, center is false. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving variance of specified width")
series_moving_var_fl(y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
}
Example
The following example uses the extend operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
let ey = series_fir(y_series, repeat(1, n), true, center);
let e2y = series_multiply(ey, ey);
let y2 = series_multiply(y_series, y_series);
let ey2 = series_fir(y2, repeat(1, n), true, center);
let var_series = series_subtract(ey2, e2y);
var_series
}
;
let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart
Stored
let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart
Output
47 - series_mv_ee_anomalies_fl()
The function series_mv_ee_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the elliptic envelope model from scikit-learn. This model assumes that the source of the multivariate data is a multi-dimensional normal distribution. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. The function builds a multi-dimensional elliptical envelope for each series and marks the points that fall outside this normal envelope as anomalies.
Syntax
T | invoke series_mv_ee_anomalies_fl(
features_cols,
anomaly_col [,
score_col [,
anomalies_pct ]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
score_col | string | | The name of the column to store the scores of the anomalies. |
anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional normally distributed data using elliptical envelope model")
series_mv_ee_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.covariance import EllipticEnvelope
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
ellipsoid.fit(dffe)
df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.
To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)
You can see that on TS1 most of the midnight anomalies were detected using this multivariate model.
Create a sample dataset
.set normal_2d_with_anomalies <|
//
let window=14d;
let dt=1h;
let n=toint(window/dt);
let rand_normal_fl=(avg:real=0.0, stdv:real=1.0)
{
let x =rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand();
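// the sum of 12 uniform(0,1) samples has mean 6 and variance 1 (Irwin-Hall), so (x - 6) approximates a standard normal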
(x - 6)*stdv + avg
};
union
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(10, 5)
| extend y=iff(hourofday(t) == 0, 2*(10-x)+7+rand_normal_fl(0, 3), 2*x+7+rand_normal_fl(0, 3)) // anomalies every midnight
| extend name='TS1'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(15, 3)
| extend y=iff(hourofday(t) == 8, (15-x)+10+rand_normal_fl(0, 2), x-7+rand_normal_fl(0, 1)) // anomalies every 8am
| extend name='TS2'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(8, 6)
| extend y=iff(hourofday(t) == 16, x+5+rand_normal_fl(0, 4), (12-x)+rand_normal_fl(0, 4)) // anomalies every 4pm
| extend name='TS3')
| summarize t=make_list(t), x=make_list(x), y=make_list(y) by name
48 - series_mv_if_anomalies_fl()
The function series_mv_if_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the isolation forest model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. The function builds an ensemble of isolation trees for each series and marks the points that are quickly isolated as anomalies.
Syntax
T | invoke series_mv_if_anomalies_fl(
features_cols,
anomaly_col [,
score_col [,
anomalies_pct [,
num_trees [,
samples_pct ]]]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
score_col | string | | The name of the column to store the scores of the anomalies. |
anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
num_trees | int | | The number of isolation trees to build for each time series. Default value: 100. |
samples_pct | real | | A real number in the range [0-100] specifying the percentage of samples used to build each tree. Default value: 100%, i.e. use the full series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using isolation forest model")
series_mv_if_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
let code = ```if 1:
from sklearn.ensemble import IsolationForest
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
num_trees = kargs['num_trees']
samples_pct = kargs['samples_pct']
dff = df[features_cols]
iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
iforest.fit(dffe)
df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.
To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)
You can see that on TS2 most of the anomalies occurring at 8am were detected using this multivariate model.
49 - series_mv_oc_anomalies_fl()
The function series_mv_oc_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the One Class SVM model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. The function trains a one-class SVM for each series and marks the points that fall outside the hypersphere as anomalies.
Syntax
T | invoke series_mv_oc_anomalies_fl(
features_cols,
anomaly_col [,
score_col [,
anomalies_pct ]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
score_col | string | | The name of the column to store the scores of the anomalies. |
anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using One Class SVM model")
series_mv_oc_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
let code = ```if 1:
from sklearn.svm import OneClassSVM
features_cols = kargs['features_cols']
anomaly_col = kargs['anomaly_col']
score_col = kargs['score_col']
anomalies_pct = kargs['anomalies_pct']
dff = df[features_cols]
svm = OneClassSVM(nu=anomalies_pct/100.0)
for i in range(len(dff)):
dffi = dff.iloc[[i], :]
dffe = dffi.explode(features_cols)
svm.fit(dffe)
df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
if score_col != '':
df.loc[i, score_col] = svm.decision_function(dffe).tolist()
result = df
```;
tbl
| evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Stored
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart
Output
The table normal_2d_with_anomalies contains a set of 3 time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.
To view the data as a scatter chart, replace the usage code with the following:
normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)
You can see that on TS1 most of the anomalies occurring at midnights were detected using this multivariate model.
50 - series_rate_fl()
The function series_rate_fl() is a user-defined function (UDF) that calculates the average rate of metric increase per second. Its logic follows the PromQL rate() function. It should be used for time series of counter metrics ingested into your database by the Prometheus monitoring system and retrieved by series_metric_fl().
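As a minimal illustration (toy timestamps and counter values, not the demo_prometheus table used below), the one-bin rate is simply the value difference divided by the time difference in seconds, mirroring the arithmetic inside the function:
print value = dynamic([100.0, 160.0, 220.0])
| extend timestamp = pack_array(datetime(2020-12-08 00:00), datetime(2020-12-08 00:01), datetime(2020-12-08 00:02))
| extend dv = series_subtract(value, array_shift_right(value, 1))                              // [null, 60, 60]
| extend dt = series_divide(series_subtract(timestamp, array_shift_right(timestamp, 1)), 1e7)  // ticks to seconds: [null, 60, 60]
| extend rate = series_divide(dv, dt)                                                          // 1 write per second in each bin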
Syntax
T | invoke series_rate_fl(
[ n_bins [,
fix_reset ]])
T is a table returned from series_metric_fl(). Its schema includes (timestamp:dynamic, name:string, labels:string, value:dynamic).
Parameters
Name | Type | Required | Description |
---|---|---|---|
n_bins | int | | The number of bins specifying the gap between the extracted metric values used to calculate the rate. The function calculates the difference between the current sample and the one n_bins before it, and divides it by the difference of their respective timestamps in seconds. The default is one bin. The default settings calculate irate(), the PromQL instantaneous rate function. |
fix_reset | bool | | Controls whether to check for counter resets and correct them, as the PromQL rate() function does. The default is true. Set it to false to skip this analysis when no counter resets are expected. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create function with (folder = "Packages\\Series", docstring = "Simulate PromQL rate()")
series_rate_fl(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
}
Examples
The following examples use the invoke operator to run the function.
Calculate average rate of metric increase
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
//
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)
Stored
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)
Output
Selects the main disk of two hosts
The following example selects the main disk of two hosts, and assumes that the function is already installed. This example uses an alternative direct calling syntax, specifying the input table as the first parameter:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
tbl
| where fix_reset // Prometheus counters can only go up
| mv-apply value to typeof(double) on
( extend correction = iff(value < prev(value), prev(value), 0.0) // if the value decreases we assume it was reset to 0, so add last value
| extend cum_correction = row_cumsum(correction)
| extend corrected_value = value + cum_correction
| summarize value = make_list(corrected_value))
| union (tbl | where not(fix_reset))
| extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
| extend dt = series_subtract(timestamp, timestampS)
| extend dt = series_divide(dt, 1e7) // converts from ticks to seconds
| extend dv = series_subtract(value, valueS)
| extend rate = series_divide(dv, dt)
| project-away dt, dv, timestampS, value, valueS
};
//
series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)
Stored
series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)
Output
51 - series_rolling_fl()
The function series_rolling_fl() is a user-defined function (UDF) that applies a rolling aggregation on a series. It takes a table containing multiple series (dynamic numerical arrays) and applies, for each series, a rolling aggregation function.
Syntax
T | invoke series_rolling_fl(
y_series,
y_rolling_series,
n,
aggr,
aggr_params,
center)
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | string | ✔️ | The name of the column that contains the series to fit. |
y_rolling_series | string | ✔️ | The name of the column to store the rolling aggregation series. |
n | int | ✔️ | The width of the rolling window. |
aggr | string | ✔️ | The name of the aggregation function to use. See aggregation functions. |
aggr_params | dynamic | | Optional parameters for the aggregation function. |
center | bool | | Indicates whether the rolling window is applied symmetrically before and after the current point or applied from the current point backwards. By default, center is true. Set it to false for calculation on streaming data. |
Aggregation functions
This function supports any aggregation function from numpy or scipy.stats that calculates a scalar out of a series. The following list isn’t exhaustive:
- sum
- mean
- min
- max
- ptp (max-min)
- percentile
- median
- std
- var
- gmean (geometric mean)
- hmean (harmonic mean)
- mode (most common value)
- moment (nth moment)
- tmean (trimmed mean)
- tmin
- tmax
- tstd
- iqr (inter quantile range)
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Rolling window functions on a series")
series_rolling_fl(tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic, center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Examples
The following examples use the invoke operator to run the function.
Calculate rolling median of 9 elements
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median')
| render timechart
Stored
//
// Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median', dynamic([null]))
| render timechart
Output
Calculate rolling min, max & 75th percentile of 15 elements
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
//
// Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart
Stored
//
// Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart
Output
Calculate the rolling trimmed mean
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
let code = ```if 1:
y_series = kargs["y_series"]
y_rolling_series = kargs["y_rolling_series"]
n = kargs["n"]
aggr = kargs["aggr"]
aggr_params = kargs["aggr_params"]
center = kargs["center"]
result = df
in_s = df[y_series]
func = getattr(np, aggr, None)
if not func:
import scipy.stats
func = getattr(scipy.stats, aggr)
if func:
result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) // trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart
Stored
range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) // trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart
Output
52 - series_shapes_fl()
The function series_shapes_fl() is a user-defined function (UDF) that detects a positive or negative trend or jump in a series. This function takes a table containing multiple time series (dynamic numerical arrays), and calculates trend and jump scores for each series. The output is a dictionary (dynamic) containing the scores.
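In outline (following the function body below), both scores are mapped into the range [-1, 1]:
trend_score = norm_slope * rsquare, where norm_slope = sign(slope) * rel_slope / (rel_slope + 0.1) and rel_slope = |n * slope / norm_range|
jump_score = norm_jump * rsquare2, where norm_jump = sign(jump) * rel_jump / (rel_jump + 0.1) and rel_jump = |jump / norm_range|; the jump score is zeroed when the split point is near the series edges or when a non-negligible right-segment slope opposes the jump
Here norm_range is the spread of the series between its 10th and 90th percentiles, n is the series length, and rsquare and rsquare2 are the goodness of fit of the one-line and two-line fits.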
Syntax
T | extend series_shapes_fl(
y_series,
advanced)
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | dynamic | ✔️ | An array cell of numeric values. |
advanced | bool | | The default is false. Set to true to output additional calculated parameters. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Series detector for positive/negative trend or step. Returns a dynamic with trend and jump scores")
series_shapes_fl(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
}
Example
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
let n = array_length(series);
// calculate normal dynamic range between 10th and 90th percentiles
let xs = array_sort_asc(series);
let low_idx = tolong(n*0.1);
let high_idx = tolong(n*0.9);
let low_pct = todouble(xs[low_idx]);
let high_pct = todouble(xs[high_idx]);
let norm_range = high_pct-low_pct;
// trend score
let lf = series_fit_line_dynamic(series);
let slope = todouble(lf.slope);
let rsquare = todouble(lf.rsquare);
let rel_slope = abs(n*slope/norm_range);
let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
let norm_slope = sign_slope*rel_slope/(rel_slope+0.1); // map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let trend_score = norm_slope*rsquare;
// jump score
let lf2=series_fit_2lines_dynamic(series);
let lslope = todouble(lf2.left.slope);
let rslope = todouble(lf2.right.slope);
let rsquare2 = todouble(lf2.rsquare);
let split_idx = tolong(lf2.split_idx);
let last_left = todouble(lf2.left.interception)+lslope*split_idx;
let first_right = todouble(lf2.right.interception)+rslope;
let jump = first_right-last_left;
let rel_jump = abs(jump/norm_range);
let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
let norm_jump = sign_jump*rel_jump/(rel_jump+0.1); // map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
let jump_score1 = norm_jump*rsquare2;
// filter for jumps that are not close to the series edges and the right slope has the same direction
let norm_rslope = abs(rslope/norm_range);
let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
"trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
"lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
, bag_pack("trend_score", trend_score, "jump_score", jump_score));
res
};
let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)
Stored
let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)
Output
The respective trend and jump scores:
tsid | shapes |
---|---|
1 | {"trend_score": 0.703199714530169, "jump_score": 0.90909090909090906} |
2 | {"trend_score": -0.51663751343174869, "jump_score": -0.90909090909090906} |
3 | {"trend_score": 0.92592592592592582, "jump_score": 0.0} |
4 | {"trend_score": -0.92592592592592582, "jump_score": 0.0} |
53 - series_uv_anomalies_fl()
The function series_uv_anomalies_fl() is a user-defined function (UDF) that detects anomalies in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays and the required anomaly detection sensitivity level. It converts each time series into the required JSON format and posts it to the Anomaly Detector service endpoint. The service response contains dynamic arrays of high/low/all anomalies, the modeled baseline time series, its normal high/low boundaries (a value above or below the high/low boundary is considered an anomaly), and the detected seasonality.
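For reference, the request body that the function builds for each series has the following shape (values are illustrative; as noted in the code comments, a fixed 'period' or a 'maxAnomalyRatio' field can also be added):
{"series": [{"value": 120.5}, {"value": 118.7}, {"value": 341.2}], "sensitivity": 85}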
Prerequisites
- An Azure subscription. Create a free Azure account.
- A cluster and database. Create a cluster and database, or a KQL database with editing permissions and data.
- The Python plugin must be enabled on the cluster. This is required for the inline Python used in the function.
- Enable the http_request plugin / http_request_post plugin on the cluster to access the anomaly detection service endpoint.
- Modify the callout policy for type webapi to access the anomaly detection service endpoint.
In the following function example, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.
Syntax
T | invoke series_uv_anomalies_fl(
y_series [,
sensitivity [,
tsid]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | string | ✔️ | The name of the input table column containing the values of the series to analyze for anomalies. |
sensitivity | int | | An integer in the range [0-100] specifying the anomaly detection sensitivity. 0 is the least sensitive detection, while 100 is the most sensitive, meaning that even a small deviation from the expected baseline is tagged as an anomaly. Default value: 85 |
tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Anomaly Detection by Azure Cognitive Service")
series_uv_anomalies_fl(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
}
Examples
The following examples use the invoke operator to run the function.
Use series_uv_anomalies_fl() to detect anomalies
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)
Stored
let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)
Output
Compare series_uv_anomalies_fl() and native series_decompose_anomalies()
The following example compares the Univariate Anomaly Detection API to the native series_decompose_anomalies() function over three time series and assumes the series_uv_anomalies_fl() function is already defined in the database:
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
let code = ```if 1:
import json
y_series = kargs["y_series"]
sensitivity = kargs["sensitivity"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "sensitivity":sensitivity} # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
, neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
| extend _tsid=toscalar(_tsid)
)
};
let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 'sid', 90)
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)
Stored
let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 'sid', 90)
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)
Output
The following graph shows anomalies detected by the Univariate Anomaly Detection API on TS1. You can also select TS2 or TS3 in the chart filter box.
The following graph shows the anomalies detected by native function on TS1.
54 - series_uv_change_points_fl()
The function series_uv_change_points_fl()
is a user-defined function (UDF) that finds change points in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays, the change point detection threshold, and the minimum size of the stable trend window. Each time series is converted to the required JSON format and posted to the Anomaly Detector service endpoint. The service response contains dynamic arrays of change points, their respective confidence scores, and the detected seasonality.
Prerequisites
- An Azure subscription. Create a free Azure account.
- A cluster and database. Create a cluster and database, or a KQL database with editing permissions and data.
- The Python plugin must be enabled on the cluster. This is required for the inline Python used in the function.
- Enable the http_request plugin / http_request_post plugin on the cluster to access the anomaly detection service endpoint.
- Modify the callout policy for type
webapi
to access the anomaly detection service endpoint.
Syntax
T | invoke series_uv_change_points_fl(
y_series [,
score_threshold [,
trend_window [,
tsid]]])
Parameters
Name | Type | Required | Description |
---|---|---|---|
y_series | string | ✔️ | The name of the input table column containing the values of the series to be anomaly detected. |
score_threshold | real | | A value specifying the minimum confidence to declare a change point. Each point whose confidence is above the threshold is defined as a change point. Default value: 0.9 |
trend_window | integer | | A value specifying the minimal window size for robust calculation of trend changes. Default value: 5 |
tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME
in the uri and YOUR-KEY
in the Ocp-Apim-Subscription-Key
of the header with your Anomaly Detector resource name and key.
let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME
in the uri and YOUR-KEY
in the Ocp-Apim-Subscription-Key
of the header with your Anomaly Detector resource name and key.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Change Points Detection by Azure Cognitive Service")
series_uv_change_points_fl(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
let code = ```if 1:
import json
y_series = kargs["y_series"]
score_threshold = kargs["score_threshold"]
trend_window = kargs["trend_window"]
json_str = []
for i in range(len(df)):
row = df.iloc[i, :]
ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window} # auto-detect period, or we can force 'period': 84
json_str = json_str + [json.dumps(json_data)]
result = df
result['json_str'] = json_str
```;
tbl
| evaluate python(typeof(*, json_str:string), code, kwargs)
| extend _tsid = column_ifexists(tsid, 1)
| partition by _tsid (
project json_str
| evaluate http_request_post(uri, headers, dynamic(null))
| project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
| extend _tsid=toscalar(_tsid)
)
};
let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, * // just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)
Stored
let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, * // just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)
Output
The following graph shows change points on a time series.
55 - time_weighted_avg_fl()
The function time_weighted_avg_fl()
is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating a simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.
There are two options to calculate the time weighted average. This function fills forward the value from the current sample until the next one. Alternatively, time_weighted_avg2_fl() linearly interpolates the metric value between consecutive samples.
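In other words, for each time bin the function computes (a sketch of the underlying formula, using the fill-forward convention of this function):

$$\mathrm{tw\_avg} = \frac{\sum_i v_i \,\Delta t_i}{\sum_i \Delta t_i}$$

where $v_i$ is the metric value carried forward from sample $i$ and $\Delta t_i$ is the duration from sample $i$ to the next sample (or bin boundary) within the bin.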
Syntax
T | invoke time_weighted_avg_fl(
t_col,
y_col,
key_col,
stime,
etime,
dt)
Parameters
Name | Type | Required | Description |
---|---|---|---|
t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
y_col | string | ✔️ | The name of the column containing the metric value of the records. |
key_col | string | ✔️ | The name of the column containing the partition key of the records. |
stime | datetime | ✔️ | The start time of the aggregation window. |
etime | datetime | ✔️ | The end time of the aggregation window. |
dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using fill forward interpolation")
time_weighted_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts asc, _val nulls last
| scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Output
_ts | _key | val |
---|---|---|
2021-04-26 00:00:00.0000000 | Device1 | 150 |
2021-04-26 01:00:00.0000000 | Device1 | 225 |
2021-04-26 00:00:00.0000000 | Device2 | 500 |
2021-04-26 01:00:00.0000000 | Device2 | 400 |
The first value of Device1 is (45m*100 + 15m*300)/60m = 150; the second value is (15m*300 + 45m*200)/60m = 225.
The first value of Device2 is (30m*600 + 30m*400)/60m = 500; the second value is (30m*400 + 15m*500 + 15m*300)/60m = 400.
56 - time_weighted_avg2_fl()
The function time_weighted_avg2_fl()
is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating a simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.
There are two options to calculate the time weighted average. This function linearly interpolates the metric value between consecutive samples. Alternatively, time_weighted_avg_fl() fills forward the value from the current sample until the next one.
Syntax
T | invoke time_weighted_avg2_fl(
t_col,
y_col,
key_col,
stime,
etime,
dt)
Parameters
Name | Type | Required | Description |
---|---|---|---|
t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
y_col | string | ✔️ | The name of the column containing the metric value of the records. |
key_col | string | ✔️ | The name of the column containing the partition key of the records. |
stime | datetime | ✔️ | The start time of the aggregation window. |
etime | datetime | ✔️ | The end time of the aggregation window. |
dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using linear interpolation")
time_weighted_avg2_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let _etime = etime + dt;
let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union tbl_ex
| where _ts between (stime.._etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| extend diff_t=(next(_ts)-_ts)/1m
)
| where isnotnull(diff_t)
| order by _key asc, _ts asc
| extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
| summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
| where t_sum > 0 and _ts <= etime
| extend tw_avg = tw_sum/t_sum
| project-away tw_sum, t_sum
| order by _key asc, _ts asc
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc
Output
_ts | _key | val |
---|---|---|
2021-04-26 00:00:00.0000000 | Device1 | 218.75 |
2021-04-26 01:00:00.0000000 | Device1 | 206.25 |
2021-04-26 00:00:00.0000000 | Device2 | 462.5 |
2021-04-26 01:00:00.0000000 | Device2 | 412.5 |
The first value of Device1 is (45m*(100+300)/2 + 15m*(300+250)/2)/60m = 218.75; the second value is (15m*(250+200)/2 + 45m*200)/60m = 206.25.
The first value of Device2 is (30m*(600+400)/2 + 30m*(400+450)/2)/60m = 462.5; the second value is (30m*(450+500)/2 + 15m*(500+300)/2 + 15m*300)/60m = 412.5.
57 - time_weighted_val_fl()
The function time_weighted_val_fl()
is a user-defined function (UDF) that linearly interpolates the metric value, using a time weighted average of the values of its previous point and its next point.
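Concretely, for a grid point at time $t$ that falls between the previous sample $(t_0, v_0)$ and the next sample $(t_1, v_1)$, the function computes (a sketch of the formula implemented by the KQL below):

$$v(t) = \frac{v_0\,(t_1 - t) + v_1\,(t - t_0)}{t_1 - t_0}$$

For example, in the table used in the example below, Device1's grid point at 01:00 falls between the samples (00:45, 300) and (01:15, 200), giving (300*15 + 200*15)/30 = 250.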
Syntax
T | invoke time_weighted_val_fl(
t_col,
y_col,
key_col,
stime,
etime,
dt)
Parameters
Name | Type | Required | Description |
---|---|---|---|
t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
y_col | string | ✔️ | The name of the column containing the metric value of the records. |
key_col | string | ✔️ | The name of the column containing the partition key of the records. |
stime | datetime | ✔️ | The start time of the aggregation window. |
etime | datetime | ✔️ | The end time of the aggregation window. |
dt | timespan | ✔️ | The aggregation time bin. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Linear interpolation of metric value by time weighted average")
time_weighted_val_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
gridTimes
| join kind=fullouter keys on dummy
| project-away dummy, dummy1
| union (tbl_ex | extend grid=0)
| where _ts between (stime..etime)
| partition hint.strategy=native by _key (
order by _ts desc, _val nulls last
| scan declare(val1:real=0.0, t1:datetime) with ( // fill backward null values
step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
| extend dt1=(t1-_ts)/1m
| order by _ts asc, _val nulls last
| scan declare(val0:real=0.0, t0:datetime) with ( // fill forward null values
step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
| extend dt0=(_ts-t0)/1m
| extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
| scan with ( // fill forward null twa values
step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
| where grid == 0 or (grid == 1 and _ts != prev(_ts))
)
| project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
| order by _key asc, _ts asc
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(2021-04-26 00:00), 100, 'Device1',
datetime(2021-04-26 00:45), 300, 'Device1',
datetime(2021-04-26 01:15), 200, 'Device1',
datetime(2021-04-26 00:00), 600, 'Device2',
datetime(2021-04-26 00:30), 400, 'Device2',
datetime(2021-04-26 01:30), 500, 'Device2',
datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc
Output
_ts | _key | val | orig_val |
---|---|---|---|
2021-04-26 00:00:00.0000000 | Device1 | 100 | 1 |
2021-04-26 00:45:00.0000000 | Device1 | 300 | 1 |
2021-04-26 01:00:00.0000000 | Device1 | 250 | 0 |
2021-04-26 01:15:00.0000000 | Device1 | 200 | 1 |
2021-04-26 00:00:00.0000000 | Device2 | 600 | 1 |
2021-04-26 00:30:00.0000000 | Device2 | 400 | 1 |
2021-04-26 01:00:00.0000000 | Device2 | 450 | 0 |
2021-04-26 01:30:00.0000000 | Device2 | 500 | 1 |
2021-04-26 01:45:00.0000000 | Device2 | 300 | 1 |
58 - time_window_rolling_avg_fl()
The function time_window_rolling_avg_fl()
is a user-defined function (UDF) that calculates the rolling average of the required value over a constant duration time window.
Calculating a rolling average over a constant time window for regular time series (that is, series with constant intervals) can be achieved using series_fir(), as the constant time window can be converted to a fixed-width filter of equal coefficients. However, calculating it for irregular time series is more complex, because the actual number of samples in the window varies. Still, it can be achieved using the powerful scan operator. For comparison, a minimal sketch of the series_fir() alternative for the regular case is shown after this paragraph.
This type of rolling window calculation is required for use cases where the metric values are emitted only when they change (and not at constant intervals). For example, in IoT, edge devices often send metrics to the cloud only upon changes, to optimize communication bandwidth.
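The following sketch handles the regular-series case natively with series_fir(), assuming a synthetic 1h-binned series and a 3-bin (3-hour) backward window; the column and variable names are illustrative only:
// Rolling 3-bin (3h) average of a regular, 1h-binned series using a fixed-width FIR filter.
// normalize=true turns the equal coefficients into an average; center=false applies the window backward.
range Timestamp from datetime(2021-01-01) to datetime(2021-01-02) step 1h
| extend val = rand()
| make-series val=avg(val) on Timestamp from datetime(2021-01-01) to datetime(2021-01-02) step 1h
| extend rolling_avg = series_fir(val, dynamic([1, 1, 1]), true, false)
| render timechart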
Syntax
T | invoke time_window_rolling_avg_fl(
t_col,
y_col,
key_col,
dt [,
direction ])
Parameters
Name | Type | Required | Description |
---|---|---|---|
t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
y_col | string | ✔️ | The name of the column containing the metric value of the records. |
key_col | string | ✔️ | The name of the column containing the partition key of the records. |
dt | timespan | ✔️ | The duration of the rolling window. |
direction | int | | The aggregation direction. The possible values are +1 or -1, setting the rolling window from the current time forward or backward, respectively. Default is -1, as a backward rolling window is the only possible method for streaming scenarios. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Series", docstring = "Time based rolling average of a metric")
time_window_rolling_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
tbl_ex
| partition hint.strategy=shuffle by key
(
extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
| mv-expand timestamp to typeof(datetime), delta to typeof(long)
| sort by timestamp asc, delta desc
| scan declare (cum_sum:double=0.0, cum_count:long=0) with
(
step s: true => cum_count = s.cum_count + delta,
cum_sum = s.cum_sum + delta * value;
)
| extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
| where delta == -direction
| project timestamp, value, avg_value, key
)
};
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(8:00), 1, 'Device1',
datetime(8:01), 2, 'Device1',
datetime(8:05), 3, 'Device1',
datetime(8:05), 10, 'Device2',
datetime(8:09), 20, 'Device2',
datetime(8:40), 4, 'Device1',
datetime(9:00), 5, 'Device1',
datetime(9:01), 6, 'Device1',
datetime(9:05), 30, 'Device2',
datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)
Stored
let tbl = datatable(ts:datetime, val:real, key:string) [
datetime(8:00), 1, 'Device1',
datetime(8:01), 2, 'Device1',
datetime(8:05), 3, 'Device1',
datetime(8:05), 10, 'Device2',
datetime(8:09), 20, 'Device2',
datetime(8:40), 4, 'Device1',
datetime(9:00), 5, 'Device1',
datetime(9:01), 6, 'Device1',
datetime(9:05), 30, 'Device2',
datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)
Output
timestamp | value | avg_value | key |
---|---|---|---|
2021-11-29 08:05:00.0000000 | 10 | 10 | Device2 |
2021-11-29 08:09:00.0000000 | 20 | 15 | Device2 |
2021-11-29 09:05:00.0000000 | 30 | 30 | Device2 |
2021-11-29 08:00:00.0000000 | 1 | 1 | Device1 |
2021-11-29 08:01:00.0000000 | 2 | 1.5 | Device1 |
2021-11-29 08:05:00.0000000 | 3 | 2 | Device1 |
2021-11-29 08:40:00.0000000 | 4 | 4 | Device1 |
2021-11-29 09:00:00.0000000 | 5 | 5 | Device1 |
2021-11-29 09:01:00.0000000 | 6 | 5.5 | Device1 |
2021-11-29 09:50:00.0000000 | 7 | 7 | Device1 |
The first value (10) at 8:05 is based on a single sample that fell in the 10-minute backward window; the second value (15) is the average of the two samples at 8:05 and at 8:09, and so on.
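To aggregate over a forward-looking window instead, pass direction=1. For example, reusing the tbl defined in the example above:
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m, 1)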
59 - two_sample_t_test_fl()
The function two_sample_t_test_fl()
is a user-defined function (UDF) that performs the Two-Sample T-Test.
Syntax
T | invoke two_sample_t_test_fl(
data1,
data2,
test_statistic,
p_value,
equal_var)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
equal_var | bool | | If true (default), performs a standard independent two-sample test that assumes equal population variances. If false, performs Welch's t-test, which doesn't assume equal population variances. In that case, consider using the native welch_test(). |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Two-Sample t-Test")
two_sample_t_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
let code = ```if 1:
from scipy import stats
import pandas
data1 = kargs["data1"]
data2 = kargs["data2"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
equal_var = kargs["equal_var"]
def func(row):
statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')
Output
ID | sample1 | sample2 | test_stat | p_val |
---|---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | -1.7415675457565645 | 0.15655096653487446 |
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | -3.2711673491022579 | 0.030755331219276136 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | -18.5515946201742 | 1.5823717131966134E-06 |
60 - User-defined functions
User-defined functions are reusable subqueries that can be defined as part of the query itself (query-defined functions), or stored as part of the database metadata (stored functions). User-defined functions are invoked through a name, are provided with zero or more input arguments (which can be scalar or tabular), and produce a single value (which can be scalar or tabular) based on the function body.
A user-defined function belongs to one of two categories:
- Scalar functions
- Tabular functions
The function’s input arguments and output determine whether it’s scalar or tabular, which then establishes how it might be used.
To optimize multiple uses of the user-defined functions within a single query, see Optimize queries that use named expressions.
We’ve created an assortment of user-defined functions that you can use in your queries. For more information, see Functions library.
Scalar function
- Has zero input arguments, or all its input arguments are scalar values
- Produces a single scalar value
- Can be used wherever a scalar expression is allowed
- May only use the row context in which it’s defined
- Can only refer to tables (and views) that are in the accessible schema
Tabular function
- Accepts one or more tabular input arguments, and zero or more scalar input arguments, and/or
- Produces a single tabular value
Function names
Valid user-defined function names must follow the same identifier naming rules as other entities.
The name must also be unique in its scope of definition.
Input arguments
Valid user-defined functions follow these rules:
- A user-defined function has a strongly typed list of zero or more input arguments.
- An input argument has a name, a type, and (for scalar arguments) a default value.
- The name of an input argument is an identifier.
- The type of an input argument is either one of the scalar data types, or a tabular schema.
Syntactically, the input arguments list is a comma-separated list of argument definitions, wrapped in parentheses. Each argument definition is specified as
ArgName:ArgType [= ArgDefaultValue]
For tabular arguments, ArgType has the same syntax as the table definition (parentheses and a list of column name/type pairs), with the addition of a solitary (*)
indicating “any tabular schema”.
For example:
Syntax | Input arguments list description |
---|---|
() | No arguments |
(s:string) | Single scalar argument called s taking a value of type string |
(a:long, b:bool=true) | Two scalar arguments, the second of which has a default value |
(T1:(*), T2:(r:real), b:bool) | Three arguments (two tabular arguments and one scalar argument) |
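For instance, the last signature in the table could be declared and invoked as follows. This is a minimal sketch; the function name AppendIfFlag and its body are illustrative only:
// Hypothetical function: returns the rows of T1, appending the rows of T2 only when b is true
let AppendIfFlag = (T1:(*), T2:(r:real), b:bool) {
    union T1, (T2 | where b)
};
AppendIfFlag((range x from 1 to 3 step 1), (datatable(r:real)[1.5, 2.5]), true)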
Examples
Scalar function
let Add7 = (arg0:long = 5) { arg0 + 7 };
range x from 1 to 10 step 1
| extend x_plus_7 = Add7(x), five_plus_seven = Add7()
Tabular function with no arguments
let tenNumbers = () { range x from 1 to 10 step 1};
tenNumbers
| extend x_plus_7 = x + 7
Tabular function with arguments
let MyFilter = (T:(x:long), v:long) {
T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)
Output
x |
---|
9 |
10 |
The following example shows a tabular function that uses a tabular input with no columns specified. Any table can be passed to the function, but no table columns can be referenced inside the function.
let MyDistinct = (T:(*)) {
T | distinct *
};
MyDistinct((range x from 1 to 3 step 1))
Output
x |
---|
1 |
2 |
3 |
Declaring user-defined functions
The declaration of a user-defined function provides:
- Function name
- Function schema (parameters it accepts, if any)
- Function body
let f=(s:string, i:long) {
tolong(s) * i
};
The function body includes:
- Exactly one expression, which provides the function’s return value (scalar or tabular value).
- Any number (zero or more) of let statements, whose scope is that of the function body. If specified, the let statements must precede the expression defining the function’s return value.
- Any number (zero or more) of query parameters statements, which declare query parameters used by the function. If specified, they must precede the expression defining the function’s return value.
Examples of user-defined functions
The following section shows examples of how to use user-defined functions.
User-defined function that uses a let statement
The following example shows a user-defined function (lambda) that accepts a parameter named id. The function is bound to the name Test and makes use of three let statements, in which the Test3 definition uses the id parameter. When run, the output from the query is 70:
let Test = (id: int) {
let Test2 = 10;
let Test3 = 10 + Test2 + id;
let Test4 = (arg: int) {
let Test5 = 20;
Test2 + Test3 + Test5 + arg
};
Test4(10)
};
range x from 1 to Test(10) step 1
| count
User-defined function that defines a default value for a parameter
The following example shows a function that accepts three arguments. The latter two have a default value and don’t have to be present at the call site.
let f = (a:long, b:string = "b.default", c:long = 0) {
strcat(a, "-", b, "-", c)
};
print f(12, c=7) // Returns "12-b.default-7"
Invoking a user-defined function
The method to invoke a user-defined function depends on the arguments that the function expects to receive. The following sections cover how to invoke a UDF without arguments, invoke a UDF with scalar arguments, and invoke a UDF with tabular arguments.
Invoke a UDF without arguments
A user-defined function that takes no arguments can be invoked either by its name, or by its name and an empty argument list in parentheses.
// Bind the identifier a to a user-defined function (lambda) that takes
// no arguments and returns a constant of type long:
let a=(){123};
// Invoke the function in two equivalent ways:
range x from 1 to 10 step 1
| extend y = x * a, z = x * a()
// Bind the identifier T to a user-defined function (lambda) that takes
// no arguments and returns a random two-by-two table:
let T=(){
range x from 1 to 2 step 1
| project x1 = rand(), x2 = rand()
};
// Invoke the function in two equivalent ways:
// (Note that the second invocation must be itself wrapped in
// an additional set of parentheses, as the union operator
// differentiates between "plain" names and expressions)
union T, (T())
Invoke a UDF with scalar arguments
A user-defined function that takes one or more scalar arguments can be invoked by using the function name and a concrete argument list in parentheses:
let f=(a:string, b:string) {
strcat(a, " (la la la)", b)
};
print f("hello", "world")
Invoke a UDF with tabular arguments
A user-defined function that takes one or more table arguments (with any number of scalar arguments) can be invoked using the function name and a concrete argument list in parentheses:
let MyFilter = (T:(x:long), v:long) {
T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)
You can also use the invoke operator to run a user-defined function that takes one or more table arguments and returns a table. This operator is useful when the first concrete table argument to the function is the source of the invoke operator:
let append_to_column_a=(T:(a:string), what:string) {
T | extend a=strcat(a, " ", what)
};
datatable (a:string) ["sad", "really", "sad"]
| invoke append_to_column_a(":-)")
Default values
Functions may provide default values to some of their parameters under the following conditions:
- Default values may be provided for scalar parameters only.
- Default values are always literals (constants). They can’t be arbitrary calculations.
- Parameters with no default value always precede parameters that do have a default value.
- Callers must provide the value of all parameters with no default values arranged in the same order as the function declaration.
- Callers don’t need to provide the value for parameters with default values, but may do so.
- Callers may provide arguments in an order that doesn’t match the order of the parameters. If so, they must name their arguments.
The following example returns a table with two identical records. In the first invocation of f
, the arguments are completely “scrambled”, so each one is explicitly given a name:
let f = (a:long, b:string = "b.default", c:long = 0) {
strcat(a, "-", b, "-", c)
};
union
(print x=f(c=7, a=12)), // "12-b.default-7"
(print x=f(12, c=7)) // "12-b.default-7"
Output
x |
---|
12-b.default-7 |
12-b.default-7 |
View functions
A user-defined function that takes no arguments and returns a tabular expression can be marked as a view. Marking a user-defined function as a view means that the function behaves like a table whenever a wildcard table name resolution is performed.
The following example shows two user-defined functions, T_view
and T_notview
, and shows how only the first one is resolved by the wildcard reference in the union
:
let T_view = view () { print x=1 };
let T_notview = () { print x=2 };
union T*
Restrictions
The following restrictions apply:
- User-defined functions can’t pass into toscalar() invocation information that depends on the row-context in which the function is called.
- User-defined functions that return a tabular expression can’t be invoked with an argument that varies with the row context.
- A function taking at least one tabular input can’t be invoked on a remote cluster.
- A scalar function can’t be invoked on a remote cluster.
The only place a user-defined function may be invoked with an argument that varies with the row context is when the user-defined function is composed of scalar functions only and doesn’t use toscalar()
.
Examples
Supported scalar function
The following query is supported because f
is a scalar function that doesn’t reference any tabular expression.
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { now() + hours*1h };
Table2 | where Column != 123 | project d = f(10)
The following query is supported because f
is a scalar function that references the tabular expression Table1
but is invoked with no reference to the current row context f(10)
:
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(10)
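The following query is also supported, because f is composed of scalar functions only and doesn't use toscalar(), so it can be invoked with an argument that varies with the row context. This is a minimal sketch that follows the same pattern as the queries above:
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { now() - hours*1h };
Table2 | where Column != 123 | project d = f(Column)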
Unsupported scalar function
The following query isn’t supported because f
is a scalar function that references the tabular expression Table1
, and is invoked with a reference to the current row context f(Column)
:
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(Column)
Unsupported tabular function
The following query isn’t supported because f
is a tabular function that is invoked in a context that expects a scalar value.
let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { range x from 1 to hours step 1 | summarize make_list(x) };
Table2 | where Column != 123 | project d = f(Column)
Features that are currently unsupported by user-defined functions
For completeness, here are some commonly requested features for user-defined functions that are currently not supported:
Function overloading: There’s currently no way to overload a function (a way to create multiple functions with the same name and different input schema).
Default values: The default value for a scalar parameter to a function must be a scalar literal (constant).
61 - wilcoxon_test_fl()
The function wilcoxon_test_fl()
is a user-defined function (UDF) that performs the Wilcoxon Test.
Syntax
T | invoke wilcoxon_test_fl()(
data,
test_statistic,
p_value)
Parameters
Name | Type | Required | Description |
---|---|---|---|
data | string | ✔️ | The name of the column containing the data to be used for the test. |
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
p_value | string | ✔️ | The name of the column to store p-value for the results. |
Function definition
You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:
Query-defined
Define the function using the following let statement. No permissions are required.
let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.
Stored
Define the stored function once using the following .create function
. Database User permissions are required.
.create-or-alter function with (folder = "Packages\\Stats", docstring = "Wilcoxon Test")
wilcoxon_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
}
Example
The following example uses the invoke operator to run the function.
Query-defined
To use a query-defined function, invoke it after the embedded function definition.
let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
let code = ```if 1:
from scipy import stats
data = kargs["data"]
test_statistic = kargs["test_statistic"]
p_value = kargs["p_value"]
def func(row):
statistics = stats.wilcoxon(row[data])
return statistics[0], statistics[1]
result = df
result[[test_statistic, p_value]] = df.apply(func, axis=1, result_type = "expand")
```;
tbl
| evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')
Stored
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')
Output
id | sample1 | test_stat | p_val |
---|---|---|---|
Test #1 | [23.64, 20.57, 20.42] | 0 | 0.10880943004054568 |
Test #2 | [20.85, 21.89, 23.41] | 0 | 0.10880943004054568 |
Test #3 | [20.13, 20.5, 21.7, 22.02] | 0 | 0.06788915486182899 |
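In each output row, test_stat holds the Wilcoxon statistic and p_val the corresponding p-value returned by scipy.stats.wilcoxon for that row's sample1 array.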