Functions

1 - bartlett_test_fl()

This article describes the bartlett_test_fl() user-defined function.

The bartlett_test_fl() function is a user-defined tabular function that performs the Bartlett Test.

Syntax

T | invoke bartlett_test_fl(data1, data2, test_statistic, p_value)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store the test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store the p-value for the results. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.bartlett(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Bartlett Test")
bartlett_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.bartlett(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let bartlett_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.bartlett(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Example query that uses the function
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke bartlett_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Output

| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.7660796224425723 | 0.183868001738637 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.9211710616896014 | 0.16572762069132516 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0026985713829234454 | 0.958570306268548 |


2 - binomial_test_fl()

This article describes the binomial_test_fl() user-defined function.

The function binomial_test_fl() is a UDF (user-defined function) that performs the binomial test.

Syntax

T | invoke binomial_test_fl(successes, trials, p_value [, success_prob [, alt_hypotheis ]])

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| successes | string | ✔️ | The name of the column containing the number of success results. |
| trials | string | ✔️ | The name of the column containing the total number of trials. |
| p_value | string | ✔️ | The name of the column to store the results. |
| success_prob | real | | The success probability. The default is 0.5. |
| alt_hypotheis | string | | The alternate hypothesis can be two-sided, greater, or less. The default is two-sided. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
    let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
    let code = ```if 1:
        from scipy import stats
        
        successes = kargs["successes"]
        trials = kargs["trials"]
        p_value = kargs["p_value"]
        success_prob = kargs["success_prob"]
        alt_hypotheis = kargs["alt_hypotheis"]
        
        def func(row, prob, h1):
            pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
            return pv
        result = df
        result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Binomial test")
binomial_test_fl(tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
    let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
    let code = ```if 1:
        from scipy import stats
        
        successes = kargs["successes"]
        trials = kargs["trials"]
        p_value = kargs["p_value"]
        success_prob = kargs["success_prob"]
        alt_hypotheis = kargs["alt_hypotheis"]
        
        def func(row, prob, h1):
            pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
            return pv
        result = df
        result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let binomial_test_fl = (tbl:(*), successes:string, trials:string, p_value:string, success_prob:real=0.5, alt_hypotheis:string='two-sided')
{
    let kwargs = bag_pack('successes', successes, 'trials', trials, 'p_value', p_value, 'success_prob', success_prob, 'alt_hypotheis', alt_hypotheis);
    let code = ```if 1:
        from scipy import stats
        
        successes = kargs["successes"]
        trials = kargs["trials"]
        p_value = kargs["p_value"]
        success_prob = kargs["success_prob"]
        alt_hypotheis = kargs["alt_hypotheis"]
        
        def func(row, prob, h1):
            pv = stats.binom_test(row[successes], row[trials], p=prob, alternative=h1)
            return pv
        result = df
        result[p_value] = df.apply(func, axis=1, args=(success_prob, alt_hypotheis), result_type="expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')

Stored

datatable(id:string, x:int, n:int) [
'Test #1', 3, 5,
'Test #2', 5, 5,
'Test #3', 3, 15
]
| extend p_val=0.0
| invoke binomial_test_fl('x', 'n', 'p_val', success_prob=0.2, alt_hypotheis='greater')

Output

| id | x | n | p_val |
|---|---|---|---|
| Test #1 | 3 | 5 | 0.05792 |
| Test #2 | 5 | 5 | 0.00032 |
| Test #3 | 3 | 15 | 0.601976790745087 |

3 - comb_fl()

This article describes the comb_fl() user-defined function.

Calculate C(n, k)

The function comb_fl() is a user-defined function (UDF) that calculates C(n, k), the number of combinations for selection of k items out of n, without order. It’s based on the native gamma() function to calculate factorial. For more information, see factorial_fl(). For a selection of k items with order, use perm_fl().
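
For example, C(5, 2) = 5! / (2! · 3!) = 10. A minimal sketch of the same calculation done directly with the native gamma() function:

print cnk = tolong(gamma(5 + 1) / gamma(5 - 2 + 1) / gamma(2 + 1))   // 10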

Syntax

comb_fl(n, k)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| n | int | ✔️ | The total number of items. |
| k | int | ✔️ | The number of selected items, without order. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let comb_fl=(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    let fact_k = gamma(k+1);
    tolong(fact_n/fact_nk/fact_k)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of combinations for selection of k items out of n items without order")
comb_fl(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    let fact_k = gamma(k+1);
    tolong(fact_n/fact_nk/fact_k)
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let comb_fl=(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    let fact_k = gamma(k+1);
    tolong(fact_n/fact_nk/fact_k)
};
range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)

Stored

range n from 3 to 10 step 3
| extend k = n-2
| extend cnk = comb_fl(n, k)

Output

| n | k | cnk |
|---|---|---|
| 3 | 1 | 3 |
| 6 | 4 | 15 |
| 9 | 7 | 36 |

4 - dbscan_dynamic_fl()

This article describes the dbscan_dynamic_fl() user-defined function.

The function dbscan_dynamic_fl() is a UDF (user-defined function) that clusterizes a dataset using the DBSCAN algorithm. This function is similar to dbscan_fl(), except that the features are supplied by a single numerical array column rather than by multiple scalar columns.

Syntax

T | invoke dbscan_dynamic_fl(features_col, cluster_col, epsilon, min_samples, metric, metric_params)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
| epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
| min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
| metric | string | | The metric to use when calculating distance between points. |
| metric_params | dynamic | | Extra keyword arguments for the metric function. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features_col].apply(np.array)
        mat = np.vstack(df1.values)
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering of features passed as a single column containing numerical array")
dbscan_dynamic_fl(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features_col].apply(np.array)
        mat = np.vstack(df1.values)
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Clustering of artificial dataset with three clusters

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let dbscan_dynamic_fl=(tbl:(*), features_col:string, cluster_col:string, epsilon:double, min_samples:int=10, metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features_col', features_col, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features_col].apply(np.array)
        mat = np.vstack(df1.values)
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

Stored

union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke dbscan_dynamic_fl("Features", "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

Screenshot of scatterchart of DBSCAN clustering of artificial dataset with three clusters.

5 - dbscan_fl()

This article describes the dbscan_fl() user-defined function.

The function dbscan_fl() is a UDF (user-defined function) that clusterizes a dataset using the DBSCAN algorithm.

Syntax

T | invoke dbscan_fl(features, cluster_col, epsilon, min_samples, metric, metric_params)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering. |
| cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record. |
| epsilon | real | ✔️ | The maximum distance between two samples to be considered as neighbors. |
| min_samples | int | | The number of samples in a neighborhood for a point to be considered as a core point. |
| metric | string | | The metric to use when calculating distance between points. |
| metric_params | dynamic | | Extra keyword arguments for the metric function. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
                       metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features = kargs["features"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features]
        mat = df1.values
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\ML", docstring = "DBSCAN clustering")
dbscan_fl(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
                       metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features = kargs["features"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features]
        mat = df1.values
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Clustering of artificial dataset with three clusters

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let dbscan_fl=(tbl:(*), features:dynamic, cluster_col:string, epsilon:double, min_samples:int=10,
                       metric:string='minkowski', metric_params:dynamic=dynamic({'p': 2}))
{
    let kwargs = bag_pack('features', features, 'cluster_col', cluster_col, 'epsilon', epsilon, 'min_samples', min_samples,
                          'metric', metric, 'metric_params', metric_params);
    let code = ```if 1:

        from sklearn.cluster import DBSCAN
        from sklearn.preprocessing import StandardScaler

        features = kargs["features"]
        cluster_col = kargs["cluster_col"]
        epsilon = kargs["epsilon"]
        min_samples = kargs["min_samples"]
        metric = kargs["metric"]
        metric_params = kargs["metric_params"]

        df1 = df[features]
        mat = df1.values
        
        # Scale the dataframe
        scaler = StandardScaler()
        mat = scaler.fit_transform(mat)

        # see https://docs.scipy.org/doc/scipy/reference/spatial.distance.html for the various distance metrics

        dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric=metric, metric_params=metric_params) # 'minkowski', 'chebyshev'
        labels = dbscan.fit_predict(mat)

        result = df
        result[cluster_col] = labels
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)

Stored

union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke dbscan_fl(pack_array("x", "y"), "cluster_id", epsilon=0.6, min_samples=4, metric_params=dynamic({'p':2}))
| render scatterchart with(series=cluster_id)

Screenshot of scatterchart of DBSCAN clustering of artificial dataset with three clusters.

6 - detect_anomalous_new_entity_fl()

Learn how to use the detect_anomalous_new_entity_fl() function to detect the appearance of anomalous new entities.

Detect the appearance of anomalous new entities in timestamped data.

The function detect_anomalous_new_entity_fl() is a UDF (user-defined function) that detects the appearance of anomalous new entities - such as IP addresses or users - in timestamped data, such as traffic logs. In a cybersecurity context, such events might be suspicious and indicate a potential attack or compromise.

The anomaly model is based on a Poisson distribution representing the number of new entities appearing per time bin (such as a day) for each scope. The Poisson distribution parameter is estimated from the rate of appearance of new entities in the training period, with an added decay factor reflecting the fact that recent appearances are more important than old ones. We thus calculate the probability of encountering a new entity in the defined detection period per scope - such as a subscription or an account. The model output is controlled by several optional parameters, such as the minimal anomaly threshold, the decay rate, and others.

The model’s direct output is an anomaly score based on the inverse of the estimated probability to encounter a new entity. The score is monotonic in the range [0, 1], with 1 representing the most anomalous case. In addition to the anomaly score, there’s a binary flag for a detected anomaly (controlled by a minimal threshold parameter), and other explanatory fields.
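
To make the scoring concrete, here's a minimal numeric sketch (the rate 0.0031 is taken from the example output below, and the calculation mirrors the function's code): the Poisson probability of seeing no new entity in a time bin is P(X=0) = exp(-rate), and that probability becomes the anomaly score of a new entity that does appear.

print newEntityProbability = round(1 - exp(-0.0031), 4)   // 0.0031, probability to see a new entity
    , newEntityAnomalyScore = round(exp(-0.0031), 4)       // 0.9969, P(X=0) under the Poisson model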

Syntax

detect_anomalous_new_entity_fl(entityColumnName, scopeColumnName, timeColumnName, startTraining, startDetection, endDetection, [maxEntitiesThresh], [minTrainingDaysThresh], [decayParam], [anomalyScoreThresh])

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| entityColumnName | string | ✔️ | The name of the input table column containing the names or IDs of the entities for which the anomaly model is calculated. |
| scopeColumnName | string | ✔️ | The name of the input table column containing the partition or scope, so that a different anomaly model is built for each scope. |
| timeColumnName | string | ✔️ | The name of the input table column containing the timestamps that are used to define the training and detection periods. |
| startTraining | datetime | ✔️ | The beginning of the training period for the anomaly model. Its end is defined by the beginning of the detection period. |
| startDetection | datetime | ✔️ | The beginning of the detection period for anomaly detection. |
| endDetection | datetime | ✔️ | The end of the detection period for anomaly detection. |
| maxEntitiesThresh | int | | The maximum number of existing entities in scope to calculate anomalies. If the number of entities is above the threshold, the scope is considered too noisy and anomalies aren't calculated. The default value is 60. |
| minTrainingDaysThresh | int | | The minimum number of days in the training period that a scope exists to calculate anomalies. If it is below the threshold, the scope is considered too new and unknown, so anomalies aren't calculated. The default value is 14. |
| decayParam | real | | The decay rate parameter for the anomaly model, a number in range (0,1]. Lower values mean faster decay, so more importance is given to later appearances in the training period. A value of 1 means no decay, so a simple average is used for Poisson distribution parameter estimation. The default value is 0.95. |
| anomalyScoreThresh | real | | The minimum value of the anomaly score for which an anomaly is detected, a number in range [0, 1]. Higher values mean that only more significant cases are considered anomalous, so fewer anomalies are detected (higher precision, lower recall). The default value is 0.9. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
                                        , timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
                                        , maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day';      // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
    T
    | extend scope      = column_ifexists(scopeColumnName, '')
    | extend entity     = column_ifexists(entityColumnName, '')
    | extend sliceTime  = todatetime(column_ifexists(timeColumnName, ''))
    | where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
    | extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
                           , sliceTime >= startDetection and sliceTime <= endDetection,  'detectSet'
                                                                                       , 'other')
    | where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
    processedData
    | summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet) 
        by scope, entity
    | extend firstSeenSet = dataSet
    | project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
    entityData
    | summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
        , firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0) 
            by scope
    | extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
    | where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
    entityData
    | join kind = inner (aggregatedCandidateScopeData) on scope 
    | where firstSeenSet == 'trainSet'
    | summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
        by scope, firstSeenSet, firstSeenEntity
    | extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
// adding exponentially decaying weights to counts
    | extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
    | extend decayingValue = countAddedEntities * decayingWeight
    | summarize   newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
                , countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
        by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
    | extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
    | extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
    processedData
    | where dataSet == 'detectSet'
    | join kind = inner (modelData) on scope
	| project-away scope1
    | where isAnomalousNewEntity == 1
    | summarize arg_min(sliceTime, *) by scope, entity
    | extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ',  slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
        , ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
    | join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
    | project-away scope1
);
resultsData
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (docstring = "Detect new and anomalous entity (such as username or IP) per scope (such as subscription or account)", skipvalidation = "true", folder = 'KCL') 
    detect_anomalous_new_entity_fl(T:(*), entityColumnName:string, scopeColumnName:string
                                        , timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
                                        , maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day';      // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
    T
    | extend scope      = column_ifexists(scopeColumnName, '')
    | extend entity     = column_ifexists(entityColumnName, '')
    | extend sliceTime  = todatetime(column_ifexists(timeColumnName, ''))
    | where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
    | extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
                           , sliceTime >= startDetection and sliceTime <= endDetection,  'detectSet'
                                                                                       , 'other')
    | where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
    processedData
    | summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet) 
        by scope, entity
    | extend firstSeenSet = dataSet
    | project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
    entityData
    | summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
        , firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0) 
            by scope
    | extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
    | where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
    entityData
    | join kind = inner (aggregatedCandidateScopeData) on scope 
    | where firstSeenSet == 'trainSet'
    | summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
        by scope, firstSeenSet, firstSeenEntity
    | extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
// adding exponentially decaying weights to counts of 
    | extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
    | extend decayingValue = countAddedEntities * decayingWeight
    | summarize   newEntityProbability = round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
                , countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
        by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
    | extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
    | extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
    processedData
    | where dataSet == 'detectSet'
    | join kind = inner (modelData) on scope
    | project-away scope1
    | where isAnomalousNewEntity == 1
    | summarize arg_min(sliceTime, *) by scope, entity
    | extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ',  slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
        , ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
    | join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
    | project-away scope1
);
resultsData
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let detect_anomalous_new_entity_fl = (T:(*), entityColumnName:string, scopeColumnName:string
                                        , timeColumnName:string, startTraining:datetime, startDetection:datetime, endDetection:datetime
                                        , maxEntitiesThresh:int = 60, minTrainingDaysThresh:int = 14, decayParam:real = 0.95, anomalyScoreThresh:real = 0.9)
{
//pre-process the input data by adding standard column names and dividing to datasets
let timePeriodBinSize = 'day';      // we assume a reasonable bin for time is day, so the probability model is built per that bin size
let processedData = (
    T
    | extend scope      = column_ifexists(scopeColumnName, '')
    | extend entity     = column_ifexists(entityColumnName, '')
    | extend sliceTime  = todatetime(column_ifexists(timeColumnName, ''))
    | where isnotempty(scope) and isnotempty(entity) and isnotempty(sliceTime)
    | extend dataSet = case((sliceTime >= startTraining and sliceTime < startDetection), 'trainSet'
                           , sliceTime >= startDetection and sliceTime <= endDetection,  'detectSet'
                                                                                       , 'other')
    | where dataSet in ('trainSet', 'detectSet')
);
// summarize the data by scope and entity. this will be used to create a distribution of entity appearances based on first seen data
let entityData = (
    processedData
    | summarize countRowsEntity = count(), firstSeenEntity = min(sliceTime), lastSeenEntity = max(sliceTime), firstSeenSet = arg_min(sliceTime, dataSet) 
        by scope, entity
    | extend firstSeenSet = dataSet
    | project-away dataSet
);
// aggregate entity data per scope and get the number of entities appearing over time
let aggregatedCandidateScopeData = (
    entityData
    | summarize countRowsScope = sum(countRowsEntity), countEntitiesScope = dcount(entity), countEntitiesScopeInTrain = dcountif(entity, firstSeenSet == 'trainSet')
        , firstSeenScope = min(firstSeenEntity), lastSeenScope = max(lastSeenEntity), hasNewEntities = iff(dcountif(entity,firstSeenSet == 'detectSet') > 0, 1, 0) 
            by scope
    | extend slicesInTrainingScope = datetime_diff(timePeriodBinSize, startDetection, firstSeenScope)
    | where countEntitiesScopeInTrain <= maxEntitiesThresh and slicesInTrainingScope >= minTrainingDaysThresh and lastSeenScope >= startDetection and hasNewEntities == 1
);
let modelData = (
    entityData
    | join kind = inner (aggregatedCandidateScopeData) on scope 
    | where firstSeenSet == 'trainSet'
    | summarize countAddedEntities = dcount(entity), firstSeenScope = min(firstSeenScope), slicesInTrainingScope = max(slicesInTrainingScope), countEntitiesScope = max(countEntitiesScope)
        by scope, firstSeenSet, firstSeenEntity
    | extend diffInDays = datetime_diff(timePeriodBinSize, startDetection, firstSeenEntity)
// adding exponentially decaying weights to counts
    | extend decayingWeight = pow(base = decayParam, exponent = diffInDays)
    | extend decayingValue = countAddedEntities * decayingWeight
    | summarize   newEntityProbability =  round(1 - exp(-1.0 * sum(decayingValue)/max(diffInDays)), 4)
                , countKnownEntities = sum(countAddedEntities), lastNewEntityTimestamp = max(firstSeenEntity), slicesOnScope = max(slicesInTrainingScope)///for explainability
        by scope, firstSeenSet
// anomaly score is based on probability to get no new entities, calculated using Poisson distribution (P(X=0) = exp(-avg)) with added decay on average
    | extend newEntityAnomalyScore = round(1 - newEntityProbability, 4)
    | extend isAnomalousNewEntity = iff(newEntityAnomalyScore >= anomalyScoreThresh, 1, 0)
);
let resultsData = (
    processedData
    | where dataSet == 'detectSet'
    | join kind = inner (modelData) on scope
    | project-away scope1
    | where isAnomalousNewEntity == 1
    | summarize arg_min(sliceTime, *) by scope, entity
    | extend anomalyType = strcat('newEntity_', entityColumnName), anomalyExplainability = strcat('The ', entityColumnName, ' ', entity, ' wasn\'t seen on ', scopeColumnName, ' ', scope, ' during the last ',  slicesOnScope, ' ', timePeriodBinSize, 's. Previously, ', countKnownEntities
        , ' entities were seen, the last one of them appearing at ', format_datetime(lastNewEntityTimestamp, 'yyyy-MM-dd HH:mm'), '.')
    | join kind = leftouter (entityData | where firstSeenSet == 'trainSet' | extend entityFirstSeens = strcat(entity, ' : ', format_datetime(firstSeenEntity, 'yyyy-MM-dd HH:mm')) | sort by scope, firstSeenEntity asc | summarize anomalyState = make_list(entityFirstSeens) by scope) on scope
    | project-away scope1
);
resultsData
};
// synthetic data generation
let detectPeriodStart   = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart    = datetime(2022-03-01 05:00);
let names               = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames          = array_length(names);
let testData            = range t from 1 to 24*60 step 1
    | extend timeSlice      = trainPeriodStart + 1h * t
    | extend countEvents    = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
    | extend userName       = tostring(names[toint(rand(countNames))])
    | extend deviceId       = hash_md5(rand())
    | extend accountName    = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
    | extend userName       = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
    | extend deviceId       = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
    | sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName    = 'userName'  //principalName for positive, deviceId for negative
                                , scopeColumnName           = 'accountName'
                                , timeColumnName            = 'timeSlice'
                                , startTraining             = trainPeriodStart
                                , startDetection            = detectPeriodStart
                                , endDetection              = detectPeriodStart
                            )

Stored

let detectPeriodStart   = datetime(2022-04-30 05:00:00.0000000);
let trainPeriodStart    = datetime(2022-03-01 05:00);
let names               = pack_array("Admin", "Dev1", "Dev2", "IT-support");
let countNames          = array_length(names);
let testData            = range t from 1 to 24*60 step 1
    | extend timeSlice      = trainPeriodStart + 1h * t
    | extend countEvents    = round(2*rand() + iff((t/24)%7>=5, 10.0, 15.0) - (((t%24)/10)*((t%24)/10)), 2) * 100 // generate a series with weekly seasonality
    | extend userName       = tostring(names[toint(rand(countNames))])
    | extend deviceId       = hash_md5(rand())
    | extend accountName    = iff(((rand() < 0.2) and (timeSlice < detectPeriodStart)), 'testEnvironment', 'prodEnvironment')
    | extend userName       = iff(timeSlice == detectPeriodStart, 'H4ck3r', userName)
    | extend deviceId       = iff(timeSlice == detectPeriodStart, 'abcdefghijklmnoprtuvwxyz012345678', deviceId)
    | sort by timeSlice desc
;
testData
| invoke detect_anomalous_new_entity_fl(entityColumnName    = 'userName'
                                , scopeColumnName           = 'accountName'
                                , timeColumnName            = 'timeSlice'
                                , startTraining             = trainPeriodStart
                                , startDetection            = detectPeriodStart
                                , endDetection              = detectPeriodStart
                            )

Output

| scope | entity | sliceTime | t | timeSlice | countEvents | userName | deviceId | accountName | dataSet | firstSeenSet | newEntityProbability | countKnownEntities | lastNewEntityTimestamp | slicesOnScope | newEntityAnomalyScore | isAnomalousNewEntity | anomalyType | anomalyExplainability | anomalyState |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| prodEnvironment | H4ck3r | 2022-04-30 05:00:00.0000000 | 1440 | 2022-04-30 05:00:00.0000000 | 1687 | H4ck3r | abcdefghijklmnoprtuvwxyz012345678 | prodEnvironment | detectSet | trainSet | 0.0031 | 4 | 2022-03-01 09:00:00.0000000 | 60 | 0.9969 | 1 | newEntity_userName | The userName H4ck3r wasn't seen on accountName prodEnvironment during the last 60 days. Previously, four entities were seen, the last one of them appearing at 2022-03-01 09:00. | ["IT-support : 2022-03-01 07:00", "Admin : 2022-03-01 08:00", "Dev2 : 2022-03-01 09:00", "Dev1 : 2022-03-01 14:00"] |

The output of running the function is the first-seen row in the test dataset for each entity per scope, filtered for new entities (meaning they didn't appear during the training period) that were tagged as anomalous (meaning that the entity anomaly score was above anomalyScoreThresh). Some other fields are added for clarity:

  • dataSet: current dataset (is always detectSet).
  • firstSeenSet: dataset in which the scope was first seen (should be ’trainSet’).
  • newEntityProbability: probability to see any new entity based on Poisson model estimation.
  • countKnownEntities: existing entities on scope.
  • lastNewEntityTimestamp: last time a new entity was seen before the anomalous one.
  • slicesOnScope: count of slices per scope.
  • newEntityAnomalyScore: anomaly score of the new entity in range [0, 1]; higher values mean more anomalous.
  • isAnomalousNewEntity: binary flag for anomalous new entities.
  • anomalyType: shows the type of anomaly (helpful when running several anomaly detection logics together).
  • anomalyExplainability: textual wrapper for generated anomaly and its explanation.
  • anomalyState: bag of existing entities on scope with their first seen times.

Running this function on users per account with default parameters detects a previously unseen and anomalous user (‘H4ck3r’) with a high anomaly score of 0.9969, meaning that its appearance is unexpected (due to the small number of existing users in the training period).

When we run the function with default parameters on deviceId as the entity, we won’t see an anomaly, due to the large number of existing devices, which makes a new one expected. However, if we lower the parameter anomalyScoreThresh to 0.0001 and raise the parameter maxEntitiesThresh to 10000, we effectively decrease precision in favor of recall, and detect an anomaly (with a low anomaly score) on device ‘abcdefghijklmnoprtuvwxyz012345678’, as shown in the sketch below.
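
A minimal sketch of that parameter override, reusing the testData table and time variables from the query-defined example above:

testData
| invoke detect_anomalous_new_entity_fl(entityColumnName    = 'deviceId'
                                , scopeColumnName           = 'accountName'
                                , timeColumnName            = 'timeSlice'
                                , startTraining             = trainPeriodStart
                                , startDetection            = detectPeriodStart
                                , endDetection              = detectPeriodStart
                                , maxEntitiesThresh         = 10000
                                , anomalyScoreThresh        = 0.0001
                            )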

The output shows the anomalous entities together with explanation fields in standardized format. These fields are useful for investigating the anomaly and for running anomalous entity detection on several entities or running other algorithms together.

The suggested usage in a cybersecurity context is running the function on meaningful entities - such as usernames or IP addresses - per meaningful scopes - such as subscriptions or accounts. A detected anomalous new entity means that its appearance isn’t expected on the scope, and might be suspicious.

7 - factorial_fl()

This article describes the factorial_fl() user-defined function.

Calculate factorial.

The function factorial_fl() is a UDF (user-defined function) that calculates the factorial of a positive integer (n!). It’s a simple wrapper of the native gamma() function.
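
Because n! equals gamma(n+1), a minimal sketch of the underlying calculation using the native gamma() function directly:

print f5 = gamma(5 + 1)   // 120, since 5! = 120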

Syntax

factorial_fl(n)

Parameters

NameTypeRequiredDescription
nint✔️The input integer for which to calculate the factorial.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let factorial_fl=(n:int)
{
    gamma(n+1)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate factorial")
factorial_fl(n:int)
{
    gamma(n+1)
}

Example

Query-defined

let factorial_fl=(n:int)
{
    gamma(n+1)
};
range x from 1 to 10 step 3
| extend fx = factorial_fl(x)

Stored

range x from 1 to 10 step 3
| extend fx = factorial_fl(x)

Output

| x | fx |
|---|---|
| 1 | 1 |
| 4 | 24 |
| 7 | 5040 |
| 10 | 3628799 |

8 - Functions

This article describes Functions.

Functions are reusable queries or query parts. Kusto supports two kinds of functions:

  • Built-in functions are hard-coded functions defined by Kusto that can’t be modified by users.

  • User-defined functions, which are divided into two types:

    • Stored functions: user-defined functions that are stored and managed as database schema entities, similar to tables. For more information, see Stored functions. To create a stored function, use the .create function command.

    • Query-defined functions: user-defined functions that are defined and used within the scope of a single query. The definition of such functions is done through a let statement. For more information on how to create query-defined functions, see Create a user defined function.

    For more information on user-defined functions, see User-defined functions.
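
For example, a minimal sketch of a query-defined function declared with a let statement (the function name and body here are illustrative):

let add_one = (x:long) { x + 1 };
print result = add_one(41)   // 42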

9 - Functions library

This article describes user-defined functions that extend query environment capabilities.

The following article contains a categorized list of UDFs (user-defined functions).

The code for each user-defined function is given in its article. It can be used within a let statement embedded in a query, or it can be persisted in a database using .create function.

Cybersecurity functions

| Function Name | Description |
|---|---|
| detect_anomalous_new_entity_fl() | Detect the appearance of anomalous new entities in timestamped data. |
| detect_anomalous_spike_fl() | Detect the appearance of anomalous spikes in numeric variables in timestamped data. |
| graph_blast_radius_fl() | Calculate the Blast Radius (list and score) of source nodes over path or edge data. |
| graph_exposure_perimeter_fl() | Calculate the Exposure Perimeter (list and score) of target nodes over path or edge data. |
| graph_path_discovery_fl() | Discover valid paths between relevant endpoints (sources and targets) over graph data (edges and nodes). |

General functions

| Function Name | Description |
|---|---|
| geoip_fl() | Retrieves geographic information of an IP address. |
| get_packages_version_fl() | Returns version information of the Python engine and the specified packages. |

Machine learning functions

| Function Name | Description |
|---|---|
| dbscan_fl() | Clusterize using the DBSCAN algorithm, features are in separate columns. |
| dbscan_dynamic_fl() | Clusterize using the DBSCAN algorithm, features are in a single dynamic column. |
| kmeans_fl() | Clusterize using the K-Means algorithm, features are in separate columns. |
| kmeans_dynamic_fl() | Clusterize using the K-Means algorithm, features are in a single dynamic column. |
| predict_fl() | Predict using an existing trained machine learning model. |
| predict_onnx_fl() | Predict using an existing trained machine learning model in ONNX format. |

Plotly functions

The following section contains functions for rendering interactive Plotly charts.

| Function Name | Description |
|---|---|
| plotly_anomaly_fl() | Render anomaly chart using a Plotly template. |
| plotly_gauge_fl() | Render gauge chart using a Plotly template. |
| plotly_scatter3d_fl() | Render 3D scatter chart using a Plotly template. |

PromQL functions

The following section contains common PromQL functions. These functions can be used for analysis of metrics ingested into your database by the Prometheus monitoring system. All functions assume that metrics in your database are structured using the Prometheus data model.

| Function Name | Description |
|---|---|
| series_metric_fl() | Select and retrieve time series stored with the Prometheus data model. |
| series_rate_fl() | Calculate the average rate of counter metric increase per second. |

Series processing functions

| Function Name | Description |
|---|---|
| quantize_fl() | Quantize metric columns. |
| series_clean_anomalies_fl() | Replace anomalies in a series by interpolated value. |
| series_cosine_similarity_fl() | Calculate the cosine similarity of two numerical vectors. |
| series_dbl_exp_smoothing_fl() | Apply a double exponential smoothing filter on series. |
| series_dot_product_fl() | Calculate the dot product of two numerical vectors. |
| series_downsample_fl() | Downsample time series by an integer factor. |
| series_exp_smoothing_fl() | Apply a basic exponential smoothing filter on series. |
| series_fit_lowess_fl() | Fit a local polynomial to series using LOWESS method. |
| series_fit_poly_fl() | Fit a polynomial to series using regression analysis. |
| series_fbprophet_forecast_fl() | Forecast time series values using the Prophet algorithm. |
| series_lag_fl() | Apply a lag filter on series. |
| series_monthly_decompose_anomalies_fl() | Detect anomalies in a series with monthly seasonality. |
| series_moving_avg_fl() | Apply a moving average filter on series. |
| series_moving_var_fl() | Apply a moving variance filter on series. |
| series_mv_ee_anomalies_fl() | Multivariate anomaly detection for series using elliptical envelope model. |
| series_mv_if_anomalies_fl() | Multivariate anomaly detection for series using isolation forest model. |
| series_mv_oc_anomalies_fl() | Multivariate anomaly detection for series using one class SVM model. |
| series_rolling_fl() | Apply a rolling aggregation function on series. |
| series_shapes_fl() | Detects positive/negative trend or jump in series. |
| series_uv_anomalies_fl() | Detect anomalies in time series using the Univariate Anomaly Detection Cognitive Service API. |
| series_uv_change_points_fl() | Detect change points in time series using the Univariate Anomaly Detection Cognitive Service API. |
| time_weighted_avg_fl() | Calculates the time weighted average of a metric using fill forward interpolation. |
| time_weighted_avg2_fl() | Calculates the time weighted average of a metric using linear interpolation. |
| time_weighted_val_fl() | Calculates the time weighted value of a metric using linear interpolation. |
| time_window_rolling_avg_fl() | Calculates the rolling average of a metric over a constant duration time window. |

Statistical and probability functions

| Function Name | Description |
|---|---|
| bartlett_test_fl() | Perform the Bartlett test. |
| binomial_test_fl() | Perform the binomial test. |
| comb_fl() | Calculate C(n, k), the number of combinations for selection of k items out of n. |
| factorial_fl() | Calculate n!, the factorial of n. |
| ks_test_fl() | Perform a Kolmogorov-Smirnov test. |
| levene_test_fl() | Perform a Levene test. |
| normality_test_fl() | Perform the normality test. |
| mann_whitney_u_test_fl() | Perform a Mann-Whitney U test. |
| pair_probabilities_fl() | Calculate various probabilities and related metrics for a pair of categorical variables. |
| pairwise_dist_fl() | Calculate pairwise distances between entities based on multiple nominal and numerical variables. |
| percentiles_linear_fl() | Calculate percentiles using linear interpolation between closest ranks. |
| perm_fl() | Calculate P(n, k), the number of permutations for selection of k items out of n. |
| two_sample_t_test_fl() | Perform the two sample t-test. |
| wilcoxon_test_fl() | Perform the Wilcoxon test. |

Text analytics

Function Name | Description
log_reduce_fl() | Find common patterns in textual logs and output a summary table.
log_reduce_full_fl() | Find common patterns in textual logs and output a full table.
log_reduce_predict_fl() | Apply a trained model to find common patterns in textual logs and output a summary table.
log_reduce_predict_full_fl() | Apply a trained model to find common patterns in textual logs and output a full table.
log_reduce_train_fl() | Find common patterns in textual logs and output a model.

10 - geoip_fl()

Learn how to use the geoip_fl() user-defined function.

geoip_fl() is a user-defined function that retrieves geographic information for an IP address.

Syntax

T | invoke geoip_fl(ip_col, country_col, state_col, city_col, longitude_col, latitude_col)

Parameters

Name | Type | Required | Description
ip_col | string | ✔️ | The name of the column containing the IP addresses to resolve.
country_col | string | ✔️ | The name of the column to store the retrieved country.
state_col | string | ✔️ | The name of the column to store the retrieved state.
city_col | string | ✔️ | The name of the column to store the retrieved city.
longitude_col | string | ✔️ | The name of the column to store the retrieved longitude.
latitude_col | string | ✔️ | The name of the column to store the retrieved latitude.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
    let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
    let code= ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install('geoip2.zip')
        import geoip2.database

        ip_col = kargs['ip_col']
        country_col = kargs['country_col']
        state_col = kargs['state_col']
        city_col = kargs['city_col']
        longitude_col = kargs['longitude_col']
        latitude_col = kargs['latitude_col']
        result=df
        reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')

        def geodata(ip):
            try:
                gd = reader.city(ip)
                geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
            except:
                geo = pd.Series((None, None, None, None, None))
            return geo
        
        result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
        
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs,
        external_artifacts =
        pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
             'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
        )
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Utils', docstring = 'Retrieve geographics of ip address')
geoip_fl(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
    let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
    let code= ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install('geoip2.zip')
        import geoip2.database

        ip_col = kargs['ip_col']
        country_col = kargs['country_col']
        state_col = kargs['state_col']
        city_col = kargs['city_col']
        longitude_col = kargs['longitude_col']
        latitude_col = kargs['latitude_col']
        result=df
        reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')

        def geodata(ip):
            try:
                gd = reader.city(ip)
                geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
            except:
                geo = pd.Series((None, None, None, None, None))
            return geo
        
        result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
        
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs,
        external_artifacts =
        pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
             'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
        )
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let geoip_fl=(tbl:(*), ip_col:string, country_col:string, state_col:string, city_col:string, longitude_col:string, latitude_col:string)
{
    let kwargs = bag_pack('ip_col', ip_col, 'country_col', country_col, 'state_col', state_col, 'city_col', city_col, 'longitude_col', longitude_col, 'latitude_col', latitude_col);
    let code= ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install('geoip2.zip')
        import geoip2.database

        ip_col = kargs['ip_col']
        country_col = kargs['country_col']
        state_col = kargs['state_col']
        city_col = kargs['city_col']
        longitude_col = kargs['longitude_col']
        latitude_col = kargs['latitude_col']
        result=df
        reader = geoip2.database.Reader(r'C:\\Temp\\GeoLite2-City.mmdb')

        def geodata(ip):
            try:
                gd = reader.city(ip)
                geo = pd.Series((gd.country.name, gd.subdivisions.most_specific.name, gd.city.name, gd.location.longitude, gd.location.latitude))
            except:
                geo = pd.Series((None, None, None, None, None))
            return geo
        
        result[[country_col, state_col, city_col, longitude_col, latitude_col]] = result[ip_col].apply(geodata)
        
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs,
        external_artifacts =
        pack('geoip2.zip', 'https://artifactswestus.blob.core.windows.net/public/geoip2-4.6.0.zip',
             'GeoLite2-City.mmdb', 'https://artifactswestus.blob.core.windows.net/public/GeoLite2-City-20230221.mmdb')
        )
};
datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')

Stored

datatable(ip:string) [
'8.8.8.8',
'20.53.203.50',
'20.81.111.85',
'20.103.85.33',
'20.84.181.62',
'205.251.242.103',
]
| extend country='', state='', city='', longitude=real(null), latitude=real(null)
| invoke geoip_fl('ip','country', 'state', 'city', 'longitude', 'latitude')

Output

ip | country | state | city | longitude | latitude
20.103.85.33 | Netherlands | North Holland | Amsterdam | 4.8883 | 52.3716
20.53.203.50 | Australia | New South Wales | Sydney | 151.2006 | -33.8715
20.81.111.85 | United States | Virginia | Tappahannock | -76.8545 | 37.9273
20.84.181.62 | United States | Iowa | Des Moines | -93.6124 | 41.6021
205.251.242.103 | United States | Virginia | Ashburn | -77.4903 | 39.0469
8.8.8.8 | United States | California | Los Angeles | -118.2441 | 34.0544

11 - get_packages_version_fl()

Learn how to use the get_packages_version_fl() user-defined function.

get_packages_version_fl() is a user-defined function that retrieves the versions of the Python engine and packages of the inline python() plugin.

The function accepts a dynamic array containing the names of the packages to check, and returns their respective versions and the Python engine version.

Syntax

T | invoke get_packages_version_fl(packages)

Parameters

Name | Type | Required | Description
packages | dynamic | | A dynamic array containing the names of the packages. Default is empty list to retrieve only the Python engine version.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
    let kwargs = pack('packages', packages);
    let code =
    ```if 1:
        import importlib
        import sys
        
        packages = kargs["packages"]
        result = pd.DataFrame(columns=["name", "ver"])
        for i in range(len(packages)):
            result.loc[i, "name"] = packages[i]
            try:
                m = importlib.import_module(packages[i])
                result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
            except Exception as ex:
                result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
        id = result.shape[0]
        result.loc[id, "name"] = "Python"
        result.loc[id, "ver"] = sys.version
    ```;
    print 1
    | evaluate python(typeof(name:string , ver:string), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Utils", docstring = "Returns version information of the Python engine and the specified packages")
get_packages_version_fl(packages:dynamic=dynamic([]))
{
    let kwargs = pack('packages', packages);
    let code =
    ```if 1:
        import importlib
        import sys
        
        packages = kargs["packages"]
        result = pd.DataFrame(columns=["name", "ver"])
        for i in range(len(packages)):
            result.loc[i, "name"] = packages[i]
            try:
                m = importlib.import_module(packages[i])
                result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
            except Exception as ex:
                result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
        id = result.shape[0]
        result.loc[id, "name"] = "Python"
        result.loc[id, "ver"] = sys.version
    ```;
    print 1
    | evaluate python(typeof(name:string , ver:string), code, kwargs)
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let get_packages_version_fl = (packages:dynamic=dynamic([]))
{
    let kwargs = pack('packages', packages);
    let code =
    ```if 1:
        import importlib
        import sys
        
        packages = kargs["packages"]
        result = pd.DataFrame(columns=["name", "ver"])
        for i in range(len(packages)):
            result.loc[i, "name"] = packages[i]
            try:
                m = importlib.import_module(packages[i])
                result.loc[i, "ver"] = m.__version__ if hasattr(m, "__version__") else "missing __version__ attribute"
            except Exception as ex:
                result.loc[i, "ver"] = "ERROR: " + (ex.msg if hasattr(ex, "msg") else "exception, no msg")
        id = result.shape[0]
        result.loc[id, "name"] = "Python"
        result.loc[id, "ver"] = sys.version
    ```;
    print 1
    | evaluate python(typeof(name:string , ver:string), code, kwargs)
};
get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))

Stored

get_packages_version_fl(pack_array('numpy', 'scipy', 'pandas', 'statsmodels', 'sklearn', 'onnxruntime', 'plotly'))

Output

name | ver
numpy | 1.23.4
onnxruntime | 1.13.1
pandas | 1.5.1
plotly | 5.11.0
Python | 3.10.8 (tags/v3.10.8:aaaf517, Oct 11 2022, 16:50:30) [MSC v.1933 64 bit (AMD64)]
scipy | 1.9.3
sklearn | 1.1.3
statsmodels | 0.13.2
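
Because the packages argument defaults to an empty list, the function can also be called with no arguments to return only the Python engine version. The following one-line query is a minimal sketch that assumes the stored function defined above has already been created in the database:

// Returns only the Python engine version, because no package names are passed.
get_packages_version_fl()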

12 - kmeans_dynamic_fl()

This article describes the kmeans_dynamic_fl() user-defined function.

The function kmeans_dynamic_fl() is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm. This function is similar to kmeans_fl(), except that the features are supplied in a single numeric array column rather than in multiple scalar columns.

Syntax

T | invoke kmeans_dynamic_fl(k, features_col, cluster_col)

Parameters

Name | Type | Required | Description
k | int | ✔️ | The number of clusters.
features_col | string | ✔️ | The name of the column containing the numeric array of features to be used for clustering.
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]

        df1 = df[features_col].apply(np.array)
        matrix = np.vstack(df1.values)
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(matrix)
        result = df
        result[cluster_col] = kmeans.labels_
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\ML", docstring = "K-Means clustering of features passed as a single column containing numerical array")
kmeans_dynamic_fl(tbl:(*),k:int, features_col:string, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]

        df1 = df[features_col].apply(np.array)
        matrix = np.vstack(df1.values)
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(matrix)
        result = df
        result[cluster_col] = kmeans.labels_
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Clustering of artificial dataset with three clusters

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let kmeans_dynamic_fl=(tbl:(*),k:int, features_col:string, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features_col', features_col, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features_col = kargs["features_col"]
        cluster_col = kargs["cluster_col"]

        df1 = df[features_col].apply(np.array)
        matrix = np.vstack(df1.values)
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(matrix)
        result = df
        result[cluster_col] = kmeans.labels_
    ```;
    tbl
    | evaluate python(typeof(*),code, kwargs)
};
union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

Stored

union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| project Features=pack_array(x, y), cluster_id=int(null)
| invoke kmeans_dynamic_fl(3, "Features", "cluster_id")
| extend x=toreal(Features[0]), y=toreal(Features[1])
| render scatterchart with(series=cluster_id)

Screenshot of scatterchart of K-Means clustering of artificial dataset with three clusters.

13 - kmeans_fl()

This article describes the kmeans_fl() user-defined function.

The function kmeans_fl() is a UDF (user-defined function) that clusterizes a dataset using the k-means algorithm.

Syntax

T | invoke kmeans_fl(k, features, cluster_col)

Parameters

Name | Type | Required | Description
k | int | ✔️ | The number of clusters.
features | dynamic | ✔️ | An array containing the names of the features columns to use for clustering.
cluster_col | string | ✔️ | The name of the column to store the output cluster ID for each record.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features = kargs["features"]
        cluster_col = kargs["cluster_col"]

        km = KMeans(n_clusters=k)
        df1 = df[features]
        km.fit(df1)
        result = df
        result[cluster_col] = km.labels_
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create function with (folder = "Packages\\ML", docstring = "K-Means clustering")
kmeans_fl(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features = kargs["features"]
        cluster_col = kargs["cluster_col"]

        km = KMeans(n_clusters=k)
        df1 = df[features]
        km.fit(df1)
        result = df
        result[cluster_col] = km.labels_
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Clusterize artificial dataset with three clusters

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let kmeans_fl=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
    let kwargs = bag_pack('k', k, 'features', features, 'cluster_col', cluster_col);
    let code = ```if 1:

        from sklearn.cluster import KMeans

        k = kargs["k"]
        features = kargs["features"]
        cluster_col = kargs["cluster_col"]

        km = KMeans(n_clusters=k)
        df1 = df[features]
        km.fit(df1)
        result = df
        result[cluster_col] = km.labels_
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
union
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)

Stored

union 
(range x from 1 to 100 step 1 | extend x=rand()+3, y=rand()+2),
(range x from 101 to 200 step 1 | extend x=rand()+1, y=rand()+4),
(range x from 201 to 300 step 1 | extend x=rand()+2, y=rand()+6)
| extend cluster_id=int(null)
| invoke kmeans_fl(3, bag_pack("x", "y"), "cluster_id")
| render scatterchart with(series=cluster_id)

Screenshot of scatterchart of K-Means clustering of artificial dataset with three clusters.

14 - ks_test_fl()

This article describes the ks_test_fl() user-defined function.

The function ks_test_fl() is a UDF (user-defined function) that performs the Kolmogorov-Smirnov test.

Syntax

T | invoke ks_test_fl(data1, data2, test_statistic, p_value)

Parameters

Name | Type | Required | Description
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test.
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test.
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results.
p_value | string | ✔️ | The name of the column to store p-value for the results.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.ks_2samp(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Kolmogorov Smirnov Test")
ks_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.ks_2samp(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let ks_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.ks_2samp(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke ks_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Output

id | sample1 | sample2 | test_stat | p_val
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 0.66666666666666674 | 0.3197243332709643
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1 | 0.03262165165202116
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 1 | 0.01106563701580386

15 - levene_test_fl()

This article describes the levene_test_fl() user-defined function.

The function levene_test_fl() is a UDF (user-defined function) that performs the Levene Test.

Syntax

T | invoke levene_test_fl(data1, data2, test_statistic, p_value)

Parameters

Name | Type | Required | Description
data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test.
data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test.
test_statistic | string | ✔️ | The name of the column to store test statistic value for the results.
p_value | string | ✔️ | The name of the column to store p-value for the results.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.levene(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Levene Test")
levene_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.levene(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let levene_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.levene(row[data1], row[data2])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke levene_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Output

id | sample1 | sample2 | test_stat | p_val
Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | 1.5587395987367387 | 0.27993504690044563
Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | 1.6402495788130482 | 0.26950872948841353
Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | 0.0032989690721642395 | 0.95606240301049072

16 - log_reduce_fl()

Learn how to use the log_reduce_fl() function to find common patterns in semi-structured textual columns.

The function log_reduce_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. It outputs a summary table containing the found patterns sorted top down by their respective frequency.

Syntax

T | invoke log_reduce_fl(reduce_col [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]])

Parameters

The following parameter descriptions are a summary. For more information, see the More about the algorithm section.

Name | Type | Required | Description
reduce_col | string | ✔️ | The name of the string column the function is applied to.
use_logram | bool | | Enable or disable the Logram algorithm. Default value is true.
use_drain | bool | | Enable or disable the Drain algorithm. Default value is true.
custom_regexes | dynamic | | A dynamic array containing pairs of regular expression and replacement symbols to be searched in each input row, and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IP addresses, and GUIDs.
custom_regexes_policy | string | | Either 'prepend', 'append', or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'.
delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single character delimiter.
similarity_th | real | | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, then this parameter has no effect.
tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, then this parameter has no effect.
trigram_th | int | | Decreasing trigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 10. If Logram is disabled, then this parameter has no effect.
bigram_th | int | | Decreasing bigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 15. If Logram is disabled, then this parameter has no effect.

More about the algorithm

The function runs multiple passes over the rows to reduce them to common patterns. The following list explains the passes:

  • Regular expression replacements: In this pass, each line is independently matched against a set of regular expressions, and each matched expression is replaced by a replacement symbol. The default regular expressions replace IP addresses, numbers, and GUIDs with <IP>, <NUM>, and <GUID>. You can prepend or append more regular expressions to the default ones, or replace them with new ones or with an empty list, by modifying custom_regexes and custom_regexes_policy. For example, to replace whole numbers with <WNUM>, set custom_regexes=pack_array('/^\d+$/', '<WNUM>'); to cancel regular expression replacement, set custom_regexes_policy='replace'. For each line, the function keeps a list of the original expressions (before replacement) to be output as parameters of the generic replacement tokens. (See the example invocation after this list.)

  • Tokenization: Similar to the previous pass, each line is processed independently and broken into tokens based on a set of delimiters. For example, to break into tokens by comma, period, or semicolon, set delimiters=pack_array(',', '.', ';').

  • Apply Logram algorithm: This pass is optional and runs only if use_logram is true. Logram is recommended when large scale is required, and when parameters can appear in the first tokens of the log entry. Conversely, disable it when the log entries are short, because in such cases the algorithm tends to replace tokens with wildcards too often. The Logram algorithm considers 3-tuples and 2-tuples of tokens. If a 3-tuple of tokens is common in the log lines (it appears more than trigram_th times), then it's likely that all three tokens are part of the pattern. If the 3-tuple is rare, then it's likely that it contains a variable that should be replaced by a wildcard. For rare 3-tuples, we consider the frequency with which the 2-tuples contained in the 3-tuple appear. If a 2-tuple is common (it appears more than bigram_th times), then the remaining token is likely to be a parameter, and not part of the pattern.
    The Logram algorithm is easy to parallelize. It requires two passes on the log corpus: the first to count the frequency of each 3-tuple and 2-tuple, and the second to apply the logic described previously to each entry. To parallelize the algorithm, you only need to partition the log entries and unify the frequency counts of the different workers.

  • Apply Drain algorithm: This pass is optional and runs only if use_drain is true. Drain is a log parsing algorithm based on a truncated-depth prefix tree. Log messages are split according to their length, and for each length the first tree_depth tokens of the log message are used to build a prefix tree. If no match for the prefix tokens is found, a new branch is created. If a match for the prefix is found, the function searches for the most similar pattern among the patterns contained in the tree leaf. Pattern similarity is measured by the ratio of matched nonwildcard tokens out of all tokens. If the similarity of the most similar pattern is above the similarity threshold (the similarity_th parameter), the log entry is matched to that pattern and all nonmatching tokens in the pattern are replaced by wildcards. If the similarity of the most similar pattern is below the similarity threshold, a new pattern containing the log entry is created.
    The default tree_depth is 4, based on testing on various logs. Increasing this depth can improve runtime but might degrade pattern accuracy; decreasing it is more accurate but slower, as each node performs many more similarity tests.
    Usually, Drain efficiently generalizes and reduces patterns (though it's hard to parallelize). However, because it relies on a prefix tree, it might not be optimal for log entries that contain parameters in their first tokens. In most cases, this can be resolved by applying Logram first.
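
The following query is an illustrative sketch, not part of the original function library: it combines the whole-number regex and the delimiter list mentioned in the list above into a single call against the same HDFS_log sample used in the Example section. Arguments are passed by name, as in the article's own example, and their order follows the Syntax section; the specific regex, replacement symbol, and delimiter choices are assumptions for demonstration only.

HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data", use_logram=true, use_drain=true,
    custom_regexes=pack_array(@'/^\d+$/', '<WNUM>'),   // replace whole numbers with the <WNUM> symbol
    custom_regexes_policy='prepend',                    // search the custom regex before the default ones
    delimiters=pack_array(' ', ',', '.', ';'))          // tokenize on space, comma, period, and semicolon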

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let log_reduce_fl=(tbl:(*), reduce_col:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a summary table')
log_reduce_fl(tbl:(*), reduce_col:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}

Example

The following example uses the invoke operator to run the function. This example uses Apache Hadoop distributed file system logs.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let log_reduce_fl=(tbl:(*), reduce_col:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")

Stored

//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| invoke log_reduce_fl(reduce_col="data")

Output

Count | LogReduce | Example
55356 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_<NUM> is added to invalidSet of <IP> | 081110 220623 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.delete: blk_1239016582509138045 is added to invalidSet of 10.251.123.195:50010
10278 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> | 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864
10256 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating
10256 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21
9140 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010
3047 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/temporary/task<NUM><NUM>m<NUM>_<NUM>/part-<NUM>. <> | 081110 215858 26 INFO dfs.FSNamesystem: BLOCK NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022
1402 | 081110 <NUM> <NUM> INFO <>: <> block blk_<NUM> <> <> | 081110 215957 15556 INFO dfs.DataNode$DataTransfer: 10.250.15.198:50010:Transmitted block blk_-3782569120714539446 to /10.251.203.129:50010
177 | 081110 <NUM> <NUM> INFO <>: <> <> <> <*> | 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474
36 | 081110 <NUM> <NUM> INFO <>: <> <> <> for block <*> | 081110 215924 15636 INFO dfs.DataNode$BlockReceiver: Receiving empty packet for block blk_3991288654265301939
12 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* <> <> <> <> <> <> <> <> | 081110 215953 19 INFO dfs.FSNamesystem: BLOCK* ask 10.250.15.198:50010 to replicate blk_-3782569120714539446 to datanode(s) 10.251.203.129:50010
12 | 081110 <NUM> <NUM> INFO <>: <> <> <> <> <> block blk_<NUM> <> <> | 081110 215955 18 INFO dfs.DataNode: 10.250.15.198:50010 Starting thread to transfer block blk_-3782569120714539446 to 10.251.203.129:50010
12 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Received block blk_<NUM> src: <IP> dest: <IP> of size <NUM> | 081110 215957 15226 INFO dfs.DataNode$DataXceiver: Received block blk_-3782569120714539446 src: /10.250.15.198:51013 dest: /10.250.15.198:50010 of size 14474705
6 | 081110 <NUM> <NUM> <> dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: <> <> <> <> <> <> <> <> size <NUM> | 081110 215924 27 WARN dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: Redundant addStoredBlock request received for blk_2522553781740514003 on 10.251.202.134:50010 size 67108864
6 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: <> <> <> <> <>: <> <> <> <> <> | 081110 215936 15714 INFO dfs.DataNode$DataXceiver: writeBlock blk_720939897861061328 received exception java.io.IOException: Couldn't read from stream
3 | 081110 <NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: <> <> <> <> <> <> <> size <NUM> <> <> <> <> <> <> <> <>. | 081110 220635 28 INFO dfs.FSNamesystem: BLOCK NameSystem.addStoredBlock: addStoredBlock request received for blk_-81196479666306310 on 10.250.17.177:50010 size 53457811 But it doesn't belong to any file.
1 | 081110 <NUM> <NUM> <> <>: <> <> <> <> <> <> <>. <> <> <> <> <>. | 081110 220631 19 WARN dfs.FSDataset: Unexpected error trying to delete block blk_-2012154052725261337. BlockInfo not found in volumeMap.

17 - log_reduce_full_fl()

This article describes the log_reduce_full_fl() user-defined function.

The function log_reduce_full_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function's algorithm and most of its parameters are identical to log_reduce_fl(). However, log_reduce_fl() outputs a patterns summary table, whereas this function outputs a full table containing the pattern and parameters for each line.

Syntax

T | invoke log_reduce_full_fl(reduce_col [, pattern_col [, parameters_col [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]]]])

Parameters

The following parameter descriptions are a summary. For more information, see the More about the algorithm section.

Name | Type | Required | Description
reduce_col | string | ✔️ | The name of the string column the function is applied to.
pattern_col | string | ✔️ | The name of the string column to populate the pattern.
parameters_col | string | ✔️ | The name of the string column to populate the pattern's parameters.
use_logram | bool | | Enable or disable the Logram algorithm. Default value is true.
use_drain | bool | | Enable or disable the Drain algorithm. Default value is true.
custom_regexes | dynamic | | A dynamic array containing pairs of regular expression and replacement symbols to be searched in each input row, and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IP addresses, and GUIDs.
custom_regexes_policy | string | | Either 'prepend', 'append', or 'replace'. Controls whether custom_regexes are prepended to, appended to, or replace the default ones. Default value is 'prepend'.
delimiters | dynamic | | A dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single character delimiter.
similarity_th | real | | Similarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, then this parameter has no effect.
tree_depth | int | | Increasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, then this parameter has no effect.
trigram_th | int | | Decreasing trigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 10. If Logram is disabled, then this parameter has no effect.
bigram_th | int | | Decreasing bigram_th increases the chances of Logram to replace tokens with wildcards. Default value is 15. If Logram is disabled, then this parameter has no effect.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
                   use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
                   delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col, 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a full table')
log_reduce_full_fl(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
                   use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
                   delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col, 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let log_reduce_full_fl=(tbl:(*), reduce_col:string, pattern_col:string, parameters_col:string,
                   use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
                   delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', pattern_col, 'parameters_column', parameters_col, 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10

Stored

//
// Finding common patterns in HDFS logs, a commonly used benchmark for log parsing
//
HDFS_log
| take 100000
| extend Patterns="", Parameters=""
| invoke log_reduce_full_fl(reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")
| take 10

Output

data | Patterns | Parameters
081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15485", "parameter_2": "5080254298708411681", "parameter_3": "67108864", "parameter_4": "/10.251.43.21"}
081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15494", "parameter_2": "-7037346755429293022", "parameter_3": "/10.251.43.21:45933", "parameter_4": "/10.251.43.21:50010"}
081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "2", "parameter_3": "-7746692545918257727"}
081110 215858 15496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15496", "parameter_2": "-7746692545918257727", "parameter_3": "67108864", "parameter_4": "/10.251.107.227"}
081110 215858 15511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15511", "parameter_2": "-8578644687709935034", "parameter_3": "/10.251.107.227:39600", "parameter_4": "/10.251.107.227:50010"}
081110 215858 15514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15514", "parameter_2": "722881101738646364", "parameter_3": "/10.251.75.79:58213", "parameter_4": "/10.251.75.79:50010"}
081110 215858 15517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "2", "parameter_3": "-7110736255599716271"}
081110 215858 15517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 | 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> | {"parameter_0": "215858", "parameter_1": "15517", "parameter_2": "-7110736255599716271", "parameter_3": "67108864", "parameter_4": "/10.251.42.246"}
081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "7257432994295824826", "parameter_3": "/10.251.26.8:41803", "parameter_4": "/10.251.26.8:50010"}
081110 215858 15533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 | 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> | {"parameter_0": "215858", "parameter_1": "15533", "parameter_2": "-7771332301119265281", "parameter_3": "/10.251.43.210:34258", "parameter_4": "/10.251.43.210:50010"}

18 - log_reduce_predict_fl()

This article describes the log_reduce_predict_fl() user-defined function.

The function log_reduce_predict_fl() parses semi-structured textual columns, such as log lines, and for each line it matches the respective pattern from a pretrained model or reports an anomaly if no matching pattern was found. The function’s output is similar to log_reduce_fl(), though the patterns are retrieved from a pretrained model generated by log_reduce_train_fl().
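
The following minimal Python sketch illustrates the idea of prediction (it is only an illustration, not the actual log_cluster implementation): each line is matched against a list of pretrained templates in which placeholders such as <NUM> and <IP> act as wildcards, and the anomaly string is returned when nothing matches. The template list and sample lines are made up for the example.

import re

# Hypothetical pretrained templates; <NUM> and <IP> act as wildcards.
templates = [
    "Received block blk_<NUM> of size <NUM> from <IP>",
    "PacketResponder <NUM> for block blk_<NUM> terminating",
]

def template_to_regex(template):
    # Escape the literal text, then turn each placeholder back into a capturing wildcard.
    pattern = re.escape(template)
    pattern = pattern.replace(re.escape("<NUM>"), r"(-?\d+)")
    pattern = pattern.replace(re.escape("<IP>"), r"(/?[\d.:]+)")
    return re.compile(pattern + r"$")

compiled = [(t, template_to_regex(t)) for t in templates]

def predict(line, anomaly_str="ANOMALY"):
    for template, regex in compiled:
        if regex.search(line):
            return template
    return anomaly_str

print(predict("Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21"))
print(predict("Unexpected checksum failure on volume 3"))  # no matching template -> ANOMALY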

Syntax

T | invoke log_reduce_predict_fl(models_tbl, model_name, reduce_col [, anomaly_str ])

Parameters

NameTypeRequiredDescription
models_tbltable✔️A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string).
model_namestring✔️The name of the model that will be retrieved from models_tbl. If the table contains multiple models matching the model name, the latest one is used.
reduce_colstring✔️The name of the string column the function is applied to.
anomaly_strstringThis string is output for lines that have no matched pattern in the model. Default value is “ANOMALY”.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                      model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '', 
                          'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a summary table')
log_reduce_predict_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                      model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '', 
                          'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let log_reduce_predict_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                      model_name:string, reduce_col:string, anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', 'LogReduce','output_parameters_col', '', 
                          'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'summary');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(Count:int, LogReduce:string, example:string), code, kwargs)
};
HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")

Stored

HDFS_log_100k
| take 1000
| invoke log_reduce_predict_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data")

Output

CountLogReduceexample
239081110<NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> 081110 215858 15494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010
231081110<NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> 081110 215858 15485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21
230081110<NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating 081110 215858 15496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating
218081110<NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: <IP> is added to blk_<NUM> size <NUM> 081110 215858 27 INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.11.85:50010 is added to blk_5080254298708411681 size 67108864
79081110<NUM> <NUM> INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: <>. <> 081110 215858 26 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/rand3/_temporary/task_200811101024_0005_m_001805_0/part-01805. blk-7037346755429293022
3081110<NUM> <NUM> INFO dfs.DataBlockScanner: Verification succeeded for <*> 081110 215859 13 INFO dfs.DataBlockScanner: Verification succeeded for blk_-7244926816084627474

19 - log_reduce_predict_full_fl()

This article describes the log_reduce_predict_full_fl() user-defined function.

The function log_reduce_predict_full_fl() parses semi-structured textual columns, such as log lines, and for each line it matches the respective pattern from a pretrained model or reports an anomaly if no matching pattern was found. The patterns are retrieved from a pretrained model generated by log_reduce_train_fl(). The function is similar to log_reduce_predict_fl(), but unlike log_reduce_predict_fl(), which outputs a patterns summary table, this function outputs a full table containing the pattern and parameters for each line.
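
To illustrate how the full output is built (a sketch only, not the library’s actual code), the values captured by a matched template’s wildcards can be emitted as a parameters bag keyed parameter_0, parameter_1, and so on. The regex and log line below are simplified stand-ins for a pretrained pattern.

import json
import re

# Hypothetical matched template for one line; each wildcard group becomes parameter_0, parameter_1, ...
pattern = r"Received block blk_(-?\d+) of size (\d+) from (/[\d.:]+)"
line = "Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21"

match = re.search(pattern, line)
parameters = {f"parameter_{i}": value for i, value in enumerate(match.groups())}
print(json.dumps(parameters))
# {"parameter_0": "5080254298708411681", "parameter_1": "67108864", "parameter_2": "/10.251.43.21"}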

Syntax

T | invoke log_reduce_predict_full_fl(models_tbl, model_name, reduce_col, pattern_col, parameters_col [, anomaly_str ])

Parameters

NameTypeRequiredDescription
models_tbltable✔️A table containing models generated by log_reduce_train_fl(). The table’s schema should be (name:string, timestamp: datetime, model:string).
model_namestring✔️The name of the model that will be retrieved from models_tbl. If the table contains multiple models matching the model name, the latest one is used.
reduce_colstring✔️The name of the string column the function is applied to.
pattern_colstring✔️The name of the string column to populate the pattern.
parameters_colstring✔️The name of the string column to populate the pattern’s parameters.
anomaly_strstringThis string is output for lines that have no matched pattern in the model. Default value is “ANOMALY”.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                           model_name:string, reduce_col:string, pattern_col:string, parameters_col:string, 
                           anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col', 
                          parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Apply a trained model to find common patterns in textual logs, output a full table')
log_reduce_predict_full_fl(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                           model_name:string, reduce_col:string, pattern_col:string, parameters_col:string, 
                           anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col', 
                          parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let log_reduce_predict_full_fl=(tbl:(*), models_tbl: (name:string, timestamp: datetime, model:string), 
                           model_name:string, reduce_col:string, pattern_col:string, parameters_col:string, 
                           anomaly_str: string = 'ANOMALY')
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('logs_col', reduce_col, 'output_patterns_col', pattern_col,'output_parameters_col', 
                          parameters_col, 'model', model_str, 'anomaly_str', anomaly_str, 'output_type', 'full');
    let code = ```if 1:
        from log_cluster import log_reduce_predict
        result = log_reduce_predict.log_reduce_predict(df, kargs)
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")

Stored

HDFS_log_100k
| extend Patterns='', Parameters=''
| take 10
| invoke log_reduce_predict_full_fl(models_tbl=ML_Models, model_name="HDFS_100K", reduce_col="data", pattern_col="Patterns", parameters_col="Parameters")

Output

dataPatternsParameters
08111021585815485 INFO dfs.DataNode$PacketResponder: Received block blk_5080254298708411681 of size 67108864 from /10.251.43.21 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> {“parameter_0”: “215858”, “parameter_1”: “15485”, “parameter_2”: “5080254298708411681”, “parameter_3”: “67108864”, “parameter_4”: “/10.251.43.21”}
08111021585815494 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7037346755429293022 src: /10.251.43.21:45933 dest: /10.251.43.21:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> {“parameter_0”: “215858”, “parameter_1”: “15494”, “parameter_2”: “-7037346755429293022”, “parameter_3”: “/10.251.43.21:45933”, “parameter_4”: “/10.251.43.21:50010”}
08111021585815496 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7746692545918257727 terminating 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating {“parameter_0”: “215858”, “parameter_1”: “15496”, “parameter_2”: “2”, “parameter_3”: “-7746692545918257727”}
08111021585815496 INFO dfs.DataNode$PacketResponder: Received block blk_-7746692545918257727 of size 67108864 from /10.251.107.227 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> {“parameter_0”: “215858”, “parameter_1”: “15496”, “parameter_2”: “-7746692545918257727”, “parameter_3”: “67108864”, “parameter_4”: “/10.251.107.227”}
08111021585815511 INFO dfs.DataNode$DataXceiver: Receiving block blk_-8578644687709935034 src: /10.251.107.227:39600 dest: /10.251.107.227:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> {“parameter_0”: “215858”, “parameter_1”: “15511”, “parameter_2”: “-8578644687709935034”, “parameter_3”: “/10.251.107.227:39600”, “parameter_4”: “/10.251.107.227:50010”}
08111021585815514 INFO dfs.DataNode$DataXceiver: Receiving block blk_722881101738646364 src: /10.251.75.79:58213 dest: /10.251.75.79:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> {“parameter_0”: “215858”, “parameter_1”: “15514”, “parameter_2”: “722881101738646364”, “parameter_3”: “/10.251.75.79:58213”, “parameter_4”: “/10.251.75.79:50010”}
08111021585815517 INFO dfs.DataNode$PacketResponder: PacketResponder 2 for block blk_-7110736255599716271 terminating 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: PacketResponder <NUM> for block blk_<NUM> terminating {“parameter_0”: “215858”, “parameter_1”: “15517”, “parameter_2”: “2”, “parameter_3”: “-7110736255599716271”}
08111021585815517 INFO dfs.DataNode$PacketResponder: Received block blk_-7110736255599716271 of size 67108864 from /10.251.42.246 081110 <NUM> <NUM> INFO dfs.DataNode$PacketResponder: Received block blk_<NUM> of size <NUM> from <IP> {“parameter_0”: “215858”, “parameter_1”: “15517”, “parameter_2”: “-7110736255599716271”, “parameter_3”: “67108864”, “parameter_4”: “/10.251.42.246”}
08111021585815533 INFO dfs.DataNode$DataXceiver: Receiving block blk_7257432994295824826 src: /10.251.26.8:41803 dest: /10.251.26.8:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> {“parameter_0”: “215858”, “parameter_1”: “15533”, “parameter_2”: “7257432994295824826”, “parameter_3”: “/10.251.26.8:41803”, “parameter_4”: “/10.251.26.8:50010”}
08111021585815533 INFO dfs.DataNode$DataXceiver: Receiving block blk_-7771332301119265281 src: /10.251.43.210:34258 dest: /10.251.43.210:50010 081110 <NUM> <NUM> INFO dfs.DataNode$DataXceiver: Receiving block blk_<NUM> src: <IP> dest: <IP> {“parameter_0”: “215858”, “parameter_1”: “15533”, “parameter_2”: “-7771332301119265281”, “parameter_3”: “/10.251.43.210:34258”, “parameter_4”: “/10.251.43.210:50010”}

20 - log_reduce_train_fl()

This article describes the log_reduce_train_fl() user-defined function.

The function log_reduce_train_fl() finds common patterns in semi-structured textual columns, such as log lines, and clusters the lines according to the extracted patterns. The function’s algorithm and most of its parameters are identical to log_reduce_fl(), but unlike log_reduce_fl(), which outputs a patterns summary table, this function outputs a serialized model. The model can be used by log_reduce_predict_fl()/log_reduce_predict_full_fl() to predict the matched pattern for new log lines.

Syntax

T | invoke log_reduce_train_fl(reduce_col, model_name [, use_logram [, use_drain [, custom_regexes [, custom_regexes_policy [, delimiters [, similarity_th [, tree_depth [, trigram_th [, bigram_th ]]]]]]]]])

Parameters

The following parameter descriptions are a summary. For more information, see the More about the algorithm section.

NameTypeRequiredDescription
reduce_colstring✔️The name of the string column the function is applied to.
model_namestring✔️The name of the output model.
use_logramboolEnable or disable the Logram algorithm. Default value is true.
use_drainboolEnable or disable the Drain algorithm. Default value is true.
custom_regexesdynamicA dynamic array containing pairs of regular expression and replacement symbols to be searched for in each input row and replaced with their respective matching symbol. Default value is dynamic([]). The default regex table replaces numbers, IPs, and GUIDs; see the masking sketch after this table.
custom_regexes_policystringEither ‘prepend’, ‘append’ or ‘replace’. Controls whether the custom_regexes are prepended or appended to the default ones, or replace them. Default value is ‘prepend’.
delimitersdynamicA dynamic array containing delimiter strings. Default value is dynamic([" "]), defining space as the only single-character delimiter.
similarity_threalSimilarity threshold, used by the Drain algorithm. Increasing similarity_th results in more refined clusters. Default value is 0.5. If Drain is disabled, then this parameter has no effect.
tree_depthintIncreasing tree_depth improves the runtime of the Drain algorithm, but might reduce its accuracy. Default value is 4. If Drain is disabled, then this parameter has no effect.
trigram_thintDecreasing trigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 10. If Logram is disabled, then this parameter has no effect.
bigram_thintDecreasing bigram_th increases the chances of Logram replacing tokens with wildcards. Default value is 15. If Logram is disabled, then this parameter has no effect.
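
To make the role of the regex tables concrete, the following short Python sketch applies regexes equivalent to the default table above to a raw log line, masking IP addresses, GUIDs, and numbers before tokenization. This is a simplified illustration, not the library’s exact preprocessing.

import re

# Simplified equivalents of the default regex table: IPs, GUIDs, and numbers
# are replaced by placeholder tokens before the line is tokenized.
default_regexes = [
    (r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)", "<IP>"),
    (r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}", "<GUID>"),
    (r"(?<=[^A-Za-z0-9])(-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$", "<NUM>"),
]

def mask(line):
    for pattern, token in default_regexes:
        line = re.sub(pattern, token, line)
    return line

print(mask("Received block blk_-7037346755429293022 src: /10.251.43.21:45933"))
# -> Received block blk_<NUM> src: <IP>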

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'model');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(model:string), code, kwargs)
    | project name=model_name, timestamp=now(), model
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = 'Packages\\Text', docstring = 'Find common patterns in textual logs, output a model')
log_reduce_train_fl(tbl:(*), reduce_col:string, model_name:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'model');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(model:string), code, kwargs)
    | project name=model_name, timestamp=now(), model
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
let log_reduce_train_fl=(tbl:(*), reduce_col:string, model_name:string,
              use_logram:bool=True, use_drain:bool=True, custom_regexes: dynamic = dynamic([]), custom_regexes_policy: string = 'prepend',
              delimiters:dynamic = dynamic(' '), similarity_th:double=0.5, tree_depth:int = 4, trigram_th:int=10, bigram_th:int=15)
{
    let default_regex_table = pack_array('(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)', '<IP>', 
                                         '([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})', '<GUID>', 
                                         '(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$', '<NUM>');
    let kwargs = bag_pack('reduced_column', reduce_col, 'delimiters', delimiters,'output_column', 'LogReduce', 'parameters_column', '', 
                          'trigram_th', trigram_th, 'bigram_th', bigram_th, 'default_regexes', default_regex_table, 
                          'custom_regexes', custom_regexes, 'custom_regexes_policy', custom_regexes_policy, 'tree_depth', tree_depth, 'similarity_th', similarity_th, 
                          'use_drain', use_drain, 'use_logram', use_logram, 'save_regex_tuples_in_output', True, 'regex_tuples_column', 'RegexesColumn', 
                          'output_type', 'model');
    let code = ```if 1:
        from log_cluster import log_reduce
        result = log_reduce.log_reduce(df, kargs)
    ```;
    tbl
    | extend LogReduce=''
    | evaluate python(typeof(model:string), code, kwargs)
    | project name=model_name, timestamp=now(), model
};
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")

Stored

//
// Finding common patterns in HDFS logs, export and store the trained model in ML_Models table
//
.set-or-append ML_Models <|
//
HDFS_log_100k
| take 100000
| invoke log_reduce_train_fl(reduce_col="data", model_name="HDFS_100K")

Output

ExtentIdOriginalSizeExtentSizeCompressedSizeIndexSizeRowCount
3734a525-cc08-44b9-a992-72de97b324141038311546108347121

21 - mann_whitney_u_test_fl()

This article describes the mann_whitney_u_test_fl() user-defined function.

The function mann_whitney_u_test_fl() is a UDF (user-defined function) that performs the Mann-Whitney U Test.
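
As shown in the function definition below, the test itself is delegated to scipy.stats.mannwhitneyu. For reference, the same call can be made directly on two samples (values taken from the example later in this article); the reported p-value may vary slightly across SciPy versions depending on the default alternative hypothesis and the normal approximation.

from scipy import stats

sample1 = [23.64, 20.57, 20.42]
sample2 = [27.1, 22.12, 33.56]

# use_continuity toggles the 1/2 continuity correction, as in mann_whitney_u_test_fl().
statistic, p_value = stats.mannwhitneyu(sample1, sample2, use_continuity=True)
print(statistic, p_value)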

Syntax

T | invoke mann_whitney_u_test_fl(data1, data2, test_statistic,p_value [, use_continuity ])

Parameters

NameTypeRequiredDescription
data1string✔️The name of the column containing the first set of data to be used for the test.
data2string✔️The name of the column containing the second set of data to be used for the test.
test_statisticstring✔️The name of the column to store test statistic value for the results.
p_valuestring✔️The name of the column to store p-value for the results.
use_continuityboolDetermines if a continuity correction (1/2) is applied. Default is true.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        use_continuity = kargs["use_continuity"]
        def func(row):
            statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
        ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Mann-Whitney U Test")
mann_whitney_u_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        use_continuity = kargs["use_continuity"]
        def func(row):
            statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
        ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let mann_whitney_u_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, use_continuity:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'use_continuity', use_continuity);
    let code = ```if 1:
        from scipy import stats
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        use_continuity = kargs["use_continuity"]
        def func(row):
            statistics = stats.mannwhitneyu(row[data1], row[data2], use_continuity=use_continuity)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
        ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke mann_whitney_u_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Output

idsample1sample2test_statp_val
Test #1[23.64, 20.57, 20.42][27.1, 22.12, 33.56]10.095215131912761986
Test #2[20.85, 21.89, 23.41][35.09, 30.02, 26.52]00.04042779918502612
Test #3[20.13, 20.5, 21.7, 22.02][32.2, 32.79, 33.9, 34.22]00.015191410988288745

22 - normality_test_fl()

This article describes the normality_test_fl() user-defined function.

The function normality_test_fl() is a UDF (user-defined function) that performs the Normality Test.
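
As the function definition below shows, the underlying test is scipy.stats.normaltest, the D’Agostino-Pearson omnibus test that combines skew and kurtosis. A standalone call looks like the following; note that SciPy requires a minimum of eight values for this test (and may warn that the kurtosis part is less reliable for small samples), which the example dataset satisfies.

from scipy import stats

sample = [23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]

# The same call that normality_test_fl() applies to each row via the python() plugin.
statistic, p_value = stats.normaltest(sample)
print(statistic, p_value)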

Syntax

T | invoke normality_test_fl(data, test_statistic,p_value)

Parameters

NameTypeRequiredDescription
datastring✔️The name of the column containing the data to be used for the test.
test_statisticstring✔️The name of the column to store test statistic value for the results.
p_valuestring✔️The name of the column to store p-value for the results.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.normaltest(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Normality Test")
normality_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.normaltest(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let normality_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.normaltest(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]),
'Test #2', dynamic([20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke normality_test_fl('sample1', 'test_stat', 'p_val')

Output

idsample1test_statp_val
Test #1[23.64, 20.57, 20.42, 27.1, 22.12, 33.56, 23.64, 20.57]7.48818731539410360.023657060728893706
Test #2[20.85, 21.89, 23.41, 35.09, 30.02, 26.52, 20.85, 21.89]3.299827503302760.19206647332255408
Test #3[20.13, 20.5, 21.7, 22.02, 32.2, 32.79, 33.9, 34.22, 20.13, 20.5]6.98684338513643240.030396685911910585

23 - pair_probabilities_fl()

This article describes the pair_probabilities_fl() user-defined function.

Calculate various probabilities and related metrics for a pair of categorical variables.

The function pair_probabilities_fl() is a UDF (user-defined function) that calculates the following probabilities and related metrics for a pair of categorical variables, A and B:

  • P(A) is the probability of each value A=a
  • P(B) is the probability of each value B=b
  • P(A|B) is the conditional probability of A=a given B=b
  • P(B|A) is the conditional probability of B=b given A=a
  • P(A∪B) is the union probability (A=a or B=b)
  • P(A∩B) is the intersection probability (A=a and B=b)
  • The lift metric is calculated as P(A∩B)/(P(A)*P(B)). For more information, see lift metric. A short numeric sketch follows this list.
    • A lift near 1 means that the joint probability of the two values is similar to what is expected if the variables were independent.
    • A lift much greater than 1 means that the values co-occur more often than expected under the independence assumption.
    • A lift much less than 1 means that the values co-occur less often than expected under the independence assumption.
  • The Jaccard similarity coefficient is calculated as P(A∩B)/P(A∪B). For more information, see Jaccard similarity coefficient.
    • A high Jaccard coefficient, close to 1, means that the values tend to occur together.
    • A low Jaccard coefficient, close to 0, means that the values tend to stay apart.
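
The following short Python sketch works through the lift and Jaccard formulas with made-up counts of the same shape as the dance-class example later in this article:

# Suppose that within one scope there are 19 pair observations, value A=a appears 6 times,
# value B=b appears 8 times, and the two values appear together 6 times.
count_all, count_a, count_b, count_ab = 19, 6, 8, 6

p_a = count_a / count_all        # P(A)
p_b = count_b / count_all        # P(B)
p_ab = count_ab / count_all      # P(A ∩ B)
p_aub = p_a + p_b - p_ab         # P(A ∪ B)

lift = p_ab / (p_a * p_b)        # > 1: the values co-occur more often than expected under independence
jaccard = p_ab / p_aub           # close to 1: the values tend to occur together

print(round(lift, 3), round(jaccard, 3))  # -> 2.375 0.75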

Syntax

pair_probabilities_fl(A, B, Scope)

Parameters

NameTypeRequiredDescription
Ascalar✔️The first categorical variable.
Bscalar✔️The second categorical variable.
Scopescalar✔️The field that contains the scope, so that the probabilities for A and B are calculated independently for each scope value.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA  = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB  = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope           // probability for each value of A
| join kind = leftouter (probB) on _B, _scope           // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB                       // union probability
       , P_AIB = P_AB/P_B                               // conditional probability of A on B
       , P_BIA = P_AB/P_A                               // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B)                     // lift metric
       , Jaccard_AB = P_AB/P_AUB                        // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate probabilities and related metrics for a pair of categorical variables")
pair_probabilities_fl (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA  = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB  = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope           // probability for each value of A
| join kind = leftouter (probB) on _B, _scope           // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB                       // union probability
       , P_AIB = P_AB/P_B                               // conditional probability of A on B
       , P_BIA = P_AB/P_A                               // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B)                     // lift metric
       , Jaccard_AB = P_AB/P_AUB                        // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let pair_probabilities_fl = (tbl:(*), A_col:string, B_col:string, scope_col:string)
{
let T = materialize(tbl | extend _A = column_ifexists(A_col, ''), _B = column_ifexists(B_col, ''), _scope = column_ifexists(scope_col, ''));
let countOnScope = T | summarize countAllOnScope = count() by _scope;
let probAB = T | summarize countAB = count() by _A, _B, _scope | join kind = leftouter (countOnScope) on _scope | extend P_AB = todouble(countAB)/countAllOnScope;
let probA  = probAB | summarize countA = sum(countAB), countAllOnScope = max(countAllOnScope) by _A, _scope | extend P_A = todouble(countA)/countAllOnScope;
let probB  = probAB | summarize countB = sum(countAB), countAllOnScope = max(countAllOnScope) by _B, _scope | extend P_B = todouble(countB)/countAllOnScope;
probAB
| join kind = leftouter (probA) on _A, _scope           // probability for each value of A
| join kind = leftouter (probB) on _B, _scope           // probability for each value of B
| extend P_AUB = P_A + P_B - P_AB                       // union probability
       , P_AIB = P_AB/P_B                               // conditional probability of A on B
       , P_BIA = P_AB/P_A                               // conditional probability of B on A
| extend Lift_AB = P_AB/(P_A * P_B)                     // lift metric
       , Jaccard_AB = P_AB/P_AUB                        // Jaccard similarity index
| project _A, _B, _scope, bin(P_A, 0.00001), bin(P_B, 0.00001), bin(P_AB, 0.00001), bin(P_AUB, 0.00001), bin(P_AIB, 0.00001)
, bin(P_BIA, 0.00001), bin(Lift_AB, 0.00001), bin(Jaccard_AB, 0.00001)
| sort by _scope, _A, _B
};
//
let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
    'James',   'Mary',      'Modern',
    'James',   'Mary',      'Modern',
    'Robert',  'Mary',      'Modern',
    'Robert',  'Mary',      'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Patricia',  'Modern',
    'Robert',  'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Linda',     'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Linda',     'Modern',
    'Robert',  'Linda',     'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Mary',      'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Mary',      'Classic',
    'Robert',  'Linda',     'Classic',
    'James',   'Mary',      'Classic',
    'James',   'Linda',     'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')

Stored

let dancePairs = datatable(boy:string, girl:string, dance_class:string)[
    'James',   'Mary',      'Modern',
    'James',   'Mary',      'Modern',
    'Robert',  'Mary',      'Modern',
    'Robert',  'Mary',      'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Patricia',  'Modern',
    'Robert',  'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Linda',     'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Linda',     'Modern',
    'Robert',  'Linda',     'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Mary',      'Modern',
    'Michael', 'Patricia',  'Modern',
    'Michael', 'Patricia',  'Modern',
    'James',   'Linda',     'Modern',
    'Robert',  'Mary',      'Classic',
    'Robert',  'Linda',     'Classic',
    'James',   'Mary',      'Classic',
    'James',   'Linda',     'Classic'
];
dancePairs
| invoke pair_probabilities_fl('boy','girl', 'dance_class')

Output

Let’s look at a list of pairs of people dancing in two dance classes, supposedly paired at random, to find out whether anything looks anomalous (meaning, not random). We’ll start by looking at each class by itself.

The Michael-Patricia pair has a lift metric of 2.375, which is significantly above 1. This value means that they’re seen together much more often than would be expected if the pairing were random. Their Jaccard coefficient is 0.75, which is close to 1. When the pair dances, they prefer to dance together.

ABscopeP_AP_BP_ABP_AUBP_AIBP_BIALift_ABJaccard_AB
RobertPatriciaModern0.315780.421050.052630.684210.124990.166660.395830.07692
RobertMaryModern0.315780.263150.157890.421050.599990.499991.899990.37499
RobertLindaModern0.315780.315780.105260.526310.333330.333331.055550.2
MichaelPatriciaModern0.315780.421050.315780.421050.750.999992.3750.75
JamesPatriciaModern0.368420.421050.052630.736840.124990.142850.339280.07142
JamesMaryModern0.368420.263150.105260.526310.40.285711.085710.2
JamesLindaModern0.368420.315780.210520.473680.666660.571421.809520.44444
RobertMaryClassic0.499990.499990.249990.750.499990.499990.999990.33333
RobertLindaClassic0.499990.499990.249990.750.499990.499990.999990.33333
JamesMaryClassic0.499990.499990.249990.750.499990.499990.999990.33333
JamesLindaClassic0.499990.499990.249990.750.499990.499990.999990.33333

24 - pairwise_dist_fl()

Learn how to use the pairwise_dist_fl() function to calculate the multivariate distance between data points in the same partition.

Calculate pairwise distances between entities based on multiple nominal and numerical variables.

The function pairwise_dist_fl() is a UDF (user-defined function) that calculates the multivariate distance between data points belonging to the same partition, taking into account nominal and numerical variables.

  • All string fields, besides entity and partition names, are considered nominal variables. The distance is equal to 1 if the values are different, and 0 if they’re the same.
  • All numerical fields are considered numerical variables. They’re normalized by transforming to z-scores and the distance is calculated as the absolute value of the difference. The total multivariate distance between data points is calculated as the average of the distances between variables.

A distance close to zero means that the entities are similar, while a distance above 1 means they’re different. Similarly, an entity whose average distance from the other entities in the partition is close to or above 1 is different from most of them, which suggests a potential outlier.
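
The following Python sketch approximates the distance definition on a three-row toy partition (it illustrates the logic only, not the stored function, so the numbers differ from the full example output below): nominal fields contribute 0 or 1, numerical fields contribute the absolute difference of their within-partition z-scores, and the result is the average over all fields.

import statistics

# A toy partition with one nominal field (accessory) and two numerical fields (height, weight).
partition = [
    {"name": "Andy",  "accessory": "Hat", "height": 160, "weight": 80},
    {"name": "Betsy", "accessory": "Bag", "height": 170, "weight": 70},
    {"name": "Cindy", "accessory": "Hat", "height": 130, "weight": 30},
]

def zscores(values):
    # Normalize a numerical field within the partition (mean 0, standard deviation 1).
    mean, stdev = statistics.mean(values), statistics.stdev(values)
    return [(v - mean) / stdev for v in values]

heights = zscores([r["height"] for r in partition])
weights = zscores([r["weight"] for r in partition])

def distance(i, j):
    nominal = 1.0 if partition[i]["accessory"] != partition[j]["accessory"] else 0.0  # 0/1 for nominal fields
    numeric = abs(heights[i] - heights[j]) + abs(weights[i] - weights[j])             # |z-score differences|
    return (nominal + numeric) / 3                                                    # average over the 3 variables

print(round(distance(0, 1), 4))  # Andy vs. Betsy
print(round(distance(0, 2), 4))  # Andy vs. Cindy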

Syntax

pairwise_dist_fl(entity, partition)

Parameters

NameTypeRequiredDescription
entitystring✔️The name of the input table column containing the names or IDs of the entities for which the distances will be calculated.
partitionstring✔️The name of the input table column containing the partition or scope, so that the distances are calculated for all pairs of entities under the same partition.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
    let generic_dist = (value1:dynamic, value2:dynamic) 
    {
        // Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
        // can be extended to other data types or tweaked by adding weights or changing formulas.
            iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
    };
    let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
    let sum_data = (
        // Calculates summary statistics to be used for normalization.
        T
        | project-reorder _entity
        | project _partition, p = pack_array(*)
        | mv-expand with_itemindex=idx p
        | summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
        | sort by _partition, idx asc
        | summarize make_list(avg_p), make_list(stdev_p) by _partition
    );
    let normalized_data = (
        // Performs normalization of numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
        // by adding metrics to the previous subquery (sum_data) and using them here.
        T
        | project _partition, p = pack_array(*)
        | join kind = leftouter (sum_data) on _partition
        | mv-apply p, list_avg_p, list_stdev_p on (
            extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
            | summarize a = make_list(normalized) by _partition
        )
        | project _partition, a
    );
    let dist_data = (
        // Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
        normalized_data
        | join kind = inner (normalized_data) on _partition
        | project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
        | mv-apply a, a1 on 
        (
            project d = generic_dist(pack_array(a), pack_array(a1))
            | summarize d = make_list(d)
        )
        | extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features        
        | project-away d
        | where entity != entity1
        | sort by _partition asc, entity asc, dist asc
    );
    dist_data
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate distances between pairs of entites based on multiple nominal and numerical variables")
pairwise_dist_fl (tbl:(*), id_col:string, partition_col:string)
{
    let generic_dist = (value1:dynamic, value2:dynamic) 
    {
        // Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
        // can be extended to other data types or tweaked by adding weights or changing formulas.
            iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
    };
    let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
    let sum_data = (
        // Calculates summary statistics to be used for normalization.
        T
        | project-reorder _entity
        | project _partition, p = pack_array(*)
        | mv-expand with_itemindex=idx p
        | summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
        | sort by _partition, idx asc
        | summarize make_list(avg_p), make_list(stdev_p) by _partition
    );
    let normalized_data = (
        // Performs normalization of numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
        // by adding metrics to the previous subquery (sum_data) and using them here.
        T
        | project _partition, p = pack_array(*)
        | join kind = leftouter (sum_data) on _partition
        | mv-apply p, list_avg_p, list_stdev_p on (
            extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
            | summarize a = make_list(normalized) by _partition
        )
        | project _partition, a
    );
    let dist_data = (
        // Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
        normalized_data
        | join kind = inner (normalized_data) on _partition
        | project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
        | mv-apply a, a1 on 
        (
            project d = generic_dist(pack_array(a), pack_array(a1))
            | summarize d = make_list(d)
        )
        | extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features        
        | project-away d
        | where entity != entity1
        | sort by _partition asc, entity asc, dist asc
    );
    dist_data
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let pairwise_dist_fl = (tbl:(*), id_col:string, partition_col:string)
{
    let generic_dist = (value1:dynamic, value2:dynamic) 
    {
        // Calculates the distance between two values; treats all strings as nominal values and numbers as numerical,
        // can be extended to other data types or tweaked by adding weights or changing formulas.
            iff(gettype(value1[0]) == "string", todouble(tostring(value1[0]) != tostring(value2[0])), abs(todouble(value1[0]) - todouble(value2[0])))
    };
    let T = (tbl | extend _entity = column_ifexists(id_col, ''), _partition = column_ifexists(partition_col, '') | project-reorder _entity, _partition);
    let sum_data = (
        // Calculates summary statistics to be used for normalization.
        T
        | project-reorder _entity
        | project _partition, p = pack_array(*)
        | mv-expand with_itemindex=idx p
        | summarize count(), avg(todouble(p)), stdev(todouble(p)) by _partition, idx
        | sort by _partition, idx asc
        | summarize make_list(avg_p), make_list(stdev_p) by _partition
    );
    let normalized_data = (
        // Performs normalization of numerical variables by subtracting the mean and scaling by the standard deviation. Other normalization techniques can be used
        // by adding metrics to the previous subquery (sum_data) and using them here.
        T
        | project _partition, p = pack_array(*)
        | join kind = leftouter (sum_data) on _partition
        | mv-apply p, list_avg_p, list_stdev_p on (
            extend normalized = iff((not(isnan(todouble(list_avg_p))) and (list_stdev_p > 0)), pack_array((todouble(p) - todouble(list_avg_p))/todouble(list_stdev_p)), p)
            | summarize a = make_list(normalized) by _partition
        )
        | project _partition, a
    );
    let dist_data = (
        // Calculates distances of included variables and sums them up to get a multivariate distance between all entities under the same partition.
        normalized_data
        | join kind = inner (normalized_data) on _partition
        | project entity = tostring(a[0]), entity1 = tostring(a1[0]), a = array_slice(a, 1, -1), a1 = array_slice(a1, 1, -1), _partition
        | mv-apply a, a1 on 
        (
            project d = generic_dist(pack_array(a), pack_array(a1))
            | summarize d = make_list(d)
        )
        | extend dist = bin((1.0*array_sum(d)-1.0)/array_length(d), 0.0001) // -1 cancels the artifact distance calculated between entity names appearing in the bag and normalizes by number of features        
        | project-away d
        | where entity != entity1
        | sort by _partition asc, entity asc, dist asc
    );
    dist_data
};
//
let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
    'Andy',     'M',    160,    80,     4,  'Hat',      'Person',
    'Betsy',    'F',    170,    70,     4,  'Bag',      'Person',
    'Cindy',    'F',    130,    30,     4,  'Hat',      'Person',
    'Dan',      'M',    190,    105,    4,  'Hat',      'Person',
    'Elmie',    'M',    110,    30,     4,  'Toy',      'Person',
    'Franny',   'F',    170,    65,     4,  'Bag',      'Person',
    'Godzilla', '?',    260,    210,    5,  'Tail',     'Person',
    'Hannie',   'F',    112,    28,     4,  'Toy',      'Person',
    'Ivie',     'F',    105,    20,     4,  'Toy',      'Person',
    'Johnnie',  'M',    107,    21,     4,  'Toy',      'Person',
    'Kyle',     'M',    175,    76,     4,  'Hat',      'Person',
    'Laura',    'F',    180,    70,     4,  'Bag',      'Person',
    'Mary',     'F',    160,    60,     4,  'Bag',      'Person',
    'Noah',     'M',    178,    90,     4,  'Hat',      'Person',
    'Odelia',   'F',    186,    76,     4,  'Bag',      'Person',
    'Paul',     'M',    158,    69,     4,  'Bag',      'Person',
    'Qui',      'F',    168,    62,     4,  'Bag',      'Person',
    'Ronnie',   'M',    108,    26,     4,  'Toy',      'Person',
    'Sonic',    'F',    52,     20,     6,  'Tail',     'Pet',
    'Tweety',   'F',    52,     20,     6,  'Tail',     'Pet' ,
    'Ulfie',    'M',    39,     29,     4,  'Wings',    'Pet',
    'Vinnie',   'F',    53,     22,     1,  'Tail',     'Pet',
    'Waldo',    'F',    51,     21,     4,  'Tail',     'Pet',
    'Xander',   'M',    50,     24,     4,  'Tail',     'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc

Stored

let raw_data = datatable(name:string, gender: string, height:int, weight:int, limbs:int, accessory:string, type:string)[
    'Andy',     'M',    160,    80,     4,  'Hat',      'Person',
    'Betsy',    'F',    170,    70,     4,  'Bag',      'Person',
    'Cindy',    'F',    130,    30,     4,  'Hat',      'Person',
    'Dan',      'M',    190,    105,    4,  'Hat',      'Person',
    'Elmie',    'M',    110,    30,     4,  'Toy',      'Person',
    'Franny',   'F',    170,    65,     4,  'Bag',      'Person',
    'Godzilla', '?',    260,    210,    5,  'Tail',     'Person',
    'Hannie',   'F',    112,    28,     4,  'Toy',      'Person',
    'Ivie',     'F',    105,    20,     4,  'Toy',      'Person',
    'Johnnie',  'M',    107,    21,     4,  'Toy',      'Person',
    'Kyle',     'M',    175,    76,     4,  'Hat',      'Person',
    'Laura',    'F',    180,    70,     4,  'Bag',      'Person',
    'Mary',     'F',    160,    60,     4,  'Bag',      'Person',
    'Noah',     'M',    178,    90,     4,  'Hat',      'Person',
    'Odelia',   'F',    186,    76,     4,  'Bag',      'Person',
    'Paul',     'M',    158,    69,     4,  'Bag',      'Person',
    'Qui',      'F',    168,    62,     4,  'Bag',      'Person',
    'Ronnie',   'M',    108,    26,     4,  'Toy',      'Person',
    'Sonic',    'F',    52,     20,     6,  'Tail',     'Pet',
    'Tweety',   'F',    52,     20,     6,  'Tail',     'Pet' ,
    'Ulfie',    'M',    39,     29,     4,  'Wings',    'Pet',
    'Vinnie',   'F',    53,     22,     1,  'Tail',     'Pet',
    'Woody',    'F',    51,     21,     4,  'Tail',     'Pet',
    'Xander',   'M',    50,     24,     4,  'Tail',     'Pet'
];
raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person' | sort by entity asc, entity1 asc
| evaluate pivot (entity, max(dist), entity1) | sort by entity1 asc

Output

entity1AndyBetsyCindyDanElmieFrannyGodzillaHannie
Andy0.3540.41250.18870.48430.37021.20870.6265
Betsy0.3540.4160.47080.63070.01611.20510.4872
Cindy0.41250.4160.60120.35750.39981.47830.214
Dan0.18870.47080.60120.6730.4871.01990.8152
Elmie0.48430.63070.35750.6730.61451.55020.1565
Franny0.37020.01610.39980.4870.61451.22130.471
Godzilla1.20871.20511.47831.01991.55021.22131.5495
Hannie0.62650.48720.2140.81520.15650.4711.5495

This example looks at entities of two different types and calculates the distance between entities of the same type, taking into account both nominal variables (such as gender or preferred accessory) and numerical variables (such as the number of limbs, height, and weight). The numerical variables are on different scales, so they must be centralized and scaled, which the function does automatically. The output is the set of entity pairs under the same partition with their calculated multivariate distance. It can be analyzed directly, visualized as a distance matrix or scatterplot, or used as input for an outlier detection algorithm by calculating the mean distance per entity, where entities with high values indicate global outliers. For example, adding an optional distance matrix visualization produces a table like the one shown in the sample. From the sample, you can see that:

  • Some pairs of entities (Betsy and Franny) have a low distance value (close to 0) indicating they’re similar.
  • Some pairs of entities (Godzilla and Elmie) have a high distance value (1 or above) indicating they’re different.

The output can further be used to calculate the average distance per entity. A high average distance might indicate a global outlier. For example, you can see that, on average, Godzilla has a high distance from the others, indicating that it’s a probable global outlier. A query for this calculation is sketched below.
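
For example, the following query, a minimal sketch that builds on the example above, ranks each person by their average distance from the others:

raw_data
| invoke pairwise_dist_fl('name', 'type')
| where _partition == 'Person'
| summarize avg_dist=round(avg(dist), 4) by entity
| sort by avg_dist desc

Based on the distance matrix above, Godzilla is expected to appear first with the largest average distance.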

25 - percentiles_linear_fl()

Learn how to use the percentiles_linear_fl() function to calculate percentiles using linear interpolation between closest ranks.

The function percentiles_linear_fl() is a user-defined function (UDF) that calculates percentiles using linear interpolation between closest ranks, the same method used by Excel’s PERCENTILE.INC function. Kusto’s native percentile functions use the nearest-rank method. For large sets of values the difference between the two methods is insignificant, so we recommend using the native function for best performance. For further details on these and other percentile calculation methods, see the percentile article on Wikipedia. The function accepts a table containing the column to calculate on and an optional grouping key, together with a dynamic array of the required percentiles, and returns a column containing a dynamic array of the percentile values for each group.

Syntax

T | invoke percentiles_linear_fl(val_col, pct_arr [, aggr_col ])

Parameters

NameTypeRequiredDescription
val_colstring✔️The name of the column that contains the values with which to calculate the percentiles.
pct_arrdynamic✔️A numerical array containing the required percentiles. Each percentile should be in the range [0-100].
aggr_colstringThe name of the column that contains the grouping key.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
    tbl
    | extend _vals = column_ifexists(val_col, 0.0)
    | extend _key = column_ifexists(aggr_col, 'ALL')
    | order by _key asc, _vals asc 
    | summarize _vals=make_list(_vals) by _key
    | extend n = array_length(_vals)
    | extend pct=pct_arr
    | mv-apply pct to typeof(real) on (
          extend index=pct/100.0*(n-1)
        | extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
        | extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
        | extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
        | summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
    | project-away n
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate linear interpolated percentiles (identical to Excel's PERCENTILE.INC)")
percentiles_linear_fl(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
    tbl
    | extend _vals = column_ifexists(val_col, 0.0)
    | extend _key = column_ifexists(aggr_col, 'ALL')
    | order by _key asc, _vals asc 
    | summarize _vals=make_list(_vals) by _key
    | extend n = array_length(_vals)
    | extend pct=pct_arr
    | mv-apply pct to typeof(real) on (
          extend index=pct/100.0*(n-1)
        | extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
        | extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
        | extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
        | summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
    | project-away n
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let percentiles_linear_fl=(tbl:(*), val_col:string, pct_arr:dynamic, aggr_col:string='')
{
    tbl
    | extend _vals = column_ifexists(val_col, 0.0)
    | extend _key = column_ifexists(aggr_col, 'ALL')
    | order by _key asc, _vals asc 
    | summarize _vals=make_list(_vals) by _key
    | extend n = array_length(_vals)
    | extend pct=pct_arr
    | mv-apply pct to typeof(real) on (
          extend index=pct/100.0*(n-1)
        | extend low_index=tolong(floor(index, 1)), high_index=tolong(ceiling(index))
        | extend interval=todouble(_vals[high_index])-todouble(_vals[low_index])
        | extend pct_val=todouble(_vals[low_index])+(index-low_index)*interval
        | summarize pct_arr=make_list(pct), pct_val=make_list(pct_val))
    | project-away n
};
datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals

Stored

datatable(x:long, name:string) [
5, 'A',
9, 'A',
7, 'A',
5, 'B',
7, 'B',
7, 'B',
10, 'B',
]
| invoke percentiles_linear_fl('x', dynamic([0, 25, 50, 75, 100]), 'name')
| project-rename name=_key, x=_vals

Output

namexpct_arrpct_val
A[5,7,9][0,25,50,75,100][5,6,7,8,9]
B[5,7,7,10][0,25,50,75,100][5,6.5,7,7.75,10]
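
To see how the linear interpolation works, take group B, whose sorted values are [5, 7, 7, 10] (n=4). For the 25th percentile the index is 25/100 * (4-1) = 0.75, which falls between positions 0 and 1, so the result is 5 + 0.75 * (7 - 5) = 6.5. For the 75th percentile the index is 2.25, giving 7 + 0.25 * (10 - 7) = 7.75, matching the output above.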

26 - perm_fl()

This article describes the perm_fl() user-defined function.

Calculate P(n, k)

The function perm_fl() is a user-defined function (UDF) that calculates P(n, k), the number of permutations for selecting k items out of n, with order. It’s based on the native gamma() function to calculate the factorial (see factorial_fl()). For selecting k items without order, use comb_fl().
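
The underlying formula is P(n, k) = n!/(n-k)!. Because gamma(n+1) = n!, the function computes it as tolong(gamma(n+1)/gamma(n-k+1)). As a standalone check that doesn’t require the function to be defined, the following query reproduces one row of the output shown later in this article:

print n=6, k=4
| extend pnk = tolong(gamma(n+1)/gamma(n-k+1))  // 6!/2! = 360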

Syntax

perm_fl(n, k)

Parameters

NameTypeRequiredDescription
nint✔️The total number of items.
kint✔️The number of items to select.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let perm_fl=(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    tolong(fact_n/fact_nk)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Calculate number of permutations for selection of k items out of n items with order")
perm_fl(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    tolong(fact_n/fact_nk)
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let perm_fl=(n:int, k:int)
{
    let fact_n = gamma(n+1);
    let fact_nk = gamma(n-k+1);
    tolong(fact_n/fact_nk)
};
range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)

Stored

range n from 3 to 10 step 3
| extend k = n-2
| extend pnk = perm_fl(n, k)

Output

nkpnk
313
64360
97181440

27 - plotly_anomaly_fl()

Learn how to use the plotly_anomaly_fl() user-defined function.

The function plotly_anomaly_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive anomaly chart.

The function accepts a table containing the source and the baseline time series, lists of positive and negative anomalies with their respective sizes, and chart labeling strings. The function returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).

Prerequisite

Extract the required ‘anomaly’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:

.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate

Syntax

T | invoke plotly_anomaly_fl(time_col, val_col, baseline_col, time_high_col, val_high_col, size_high_col, time_low_col, val_low_col, size_low_col, chart_title, series_name, val_name)

Parameters

NameTypeRequiredDescription
time_colstring✔️The name of the column containing the dynamic array of the time points of the original time series
val_colstring✔️The name of the column containing the values of the original time series
baseline_colstring✔️The name of the column containing the values of the baseline time series. Anomalies are usually detected by large value offset from the expected baseline value.
time_high_colstring✔️The name of the column containing the time points of high (above the baseline) anomalies
val_high_colstring✔️The name of the column containing the values of the high anomalies
size_high_colstring✔️The name of the column containing the marker sizes of the high anomalies
time_low_colstring✔️The name of the column containing the time points of low anomalies
val_low_colstring✔️The name of the column containing the values of the low anomalies
size_low_colstring✔️The name of the column containing the marker sizes of the low anomalies
chart_titlestringThe chart title. Default is ‘Anomaly chart’.
series_namestringThe time series name. Default is ‘Metric’.
val_namestringThe value axis name. Default is ‘Value’.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
                                time_low_col:string, val_low_col:string, size_low_col:string,
                                chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
    let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
    let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
                              _high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
                              _low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
    tbl_ex
    | extend plotly = anomaly_chart
    | extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
    | extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
    | extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
    | extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
    | project plotly
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render anomaly chart using plotly template")
plotly_anomaly_fl(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
                                time_low_col:string, val_low_col:string, size_low_col:string,
                                chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
    let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
    let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
                              _high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
                              _low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
    tbl_ex
    | extend plotly = anomaly_chart
    | extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
    | extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
    | extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
    | extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
    | project plotly
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let plotly_anomaly_fl=(tbl:(*), time_col:string, val_col:string, baseline_col:string, time_high_col:string , val_high_col:string, size_high_col:string,
                                time_low_col:string, val_low_col:string, size_low_col:string,
                                chart_title:string='Anomaly chart', series_name:string='Metric', val_name:string='Value')
{
    let anomaly_chart = toscalar(PlotlyTemplate | where name == "anomaly" | project plotly);
    let tbl_ex = tbl | extend _timestamp = column_ifexists(time_col, datetime(null)), _values = column_ifexists(val_col, 0.0), _baseline = column_ifexists(baseline_col, 0.0),
                              _high_timestamp = column_ifexists(time_high_col, datetime(null)), _high_values = column_ifexists(val_high_col, 0.0), _high_size = column_ifexists(size_high_col, 1),
                              _low_timestamp = column_ifexists(time_low_col, datetime(null)), _low_values = column_ifexists(val_low_col, 0.0), _low_size = column_ifexists(size_low_col, 1);
    tbl_ex
    | extend plotly = anomaly_chart
    | extend plotly=replace_string(plotly, '$TIME_STAMPS$', tostring(_timestamp))
    | extend plotly=replace_string(plotly, '$SERIES_VALS$', tostring(_values))
    | extend plotly=replace_string(plotly, '$BASELINE_VALS$', tostring(_baseline))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_HIGH_ANOMALIES$', tostring(_high_timestamp))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_VALS$', tostring(_high_values))
    | extend plotly=replace_string(plotly, '$HIGH_ANOMALIES_MARKER_SIZE$', tostring(_high_size))
    | extend plotly=replace_string(plotly, '$TIME_STAMPS_LOW_ANOMALIES$', tostring(_low_timestamp))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_VALS$', tostring(_low_values))
    | extend plotly=replace_string(plotly, '$LOW_ANOMALIES_MARKER_SIZE$', tostring(_low_size))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$SERIES_NAME$', series_name)
    | extend plotly=replace_string(plotly, '$Y_NAME$', val_name)
    | project plotly
};
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime)  on (
    summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
              nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
                           chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly

Stored

let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let marker_scale = 8;
let s_name = 'TS1';
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == s_name
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| mv-apply num1=num to typeof(double), anomalies1=anomalies to typeof(double), score1=score to typeof(double), TimeStamp1=TimeStamp to typeof(datetime)  on (
    summarize pAnomalies=make_list_if(num1, anomalies1 > 0), pTimeStamp=make_list_if(TimeStamp1, anomalies1 > 0), pSize=make_list_if(toint(score1*marker_scale), anomalies1 > 0),
              nAnomalies=make_list_if(num1, anomalies1 < 0), nTimeStamp=make_list_if(TimeStamp1, anomalies1 < 0), nSize=make_list_if(toint(-score1*marker_scale), anomalies1 < 0)
)
| invoke plotly_anomaly_fl('TimeStamp', 'num', 'baseline', 'pTimeStamp', 'pAnomalies', 'pSize', 'nTimeStamp', 'nAnomalies', 'nSize',
                           chart_title='Anomaly chart using plotly_anomaly_fl()', series_name=s_name, val_name='# of requests')
| render plotly

Output

The output is a Plotly JSON string that can be rendered using ‘| render plotly’, in an Azure Data Explorer dashboard tile, or in a Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.

The following image shows a sample anomaly chart using the above function:

Screenshot of anomaly chart of the sample dataset.

You can zoom in and hover over anomalies:

Screenshot of zoom in anomalous region. Screenshot of hover over anomaly.

28 - plotly_gauge_fl()

Learn how to use the plotly_gauge_fl() user-defined function.

The function plotly_gauge_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create a gauge chart.

The function accepts a few parameters to customize the gauge chart and returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).

Prerequisite

Extract the required ‘gauge’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:

.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate

Syntax

T | invoke plotly_gauge_fl(value, max_range, mode, chart_title, font_color, bar_color, bar_bg_color, tick_color, tick_width)

Parameters

NameTypeRequiredDescription
valuereal✔️The number to be displayed.
max_rangerealThe maximum range of the gauge.
modestringSpecifies how the value is displayed on the graph. Default is ‘gauge+number’.
chart_titlestringThe chart title. The default is an empty title.
font_colorstringThe chart’s font color. Default is ‘black’.
bar_colorstringThe gauge’s filled bar color. Default is ‘green’.
bar_bg_colorstringThe gauge’s unfilled bar color. Default is ‘lightgreen’.
tick_colorstringThe color of the gauge’s ticks. Default is ‘darkblue’.
tick_widthintThe width of the gauge’s ticks. Default is 1.

Plotly gauge charts support many parameters, but this function exposes only the ones listed above. For more information, see the indicator traces reference.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
                    bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
    let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
    print plotly = gauge_chart
    | extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
    | extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
    | extend plotly=replace_string(plotly, '$MODE$', mode)
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
    | extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
    | extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
    | extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
    | extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
    | project plotly
};
// Write your query to use your function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render gauge chart using plotly template")
plotly_gauge_fl(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
                    bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
    let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
    print plotly = gauge_chart
    | extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
    | extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
    | extend plotly=replace_string(plotly, '$MODE$', mode)
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
    | extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
    | extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
    | extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
    | extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
    | project plotly
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let plotly_gauge_fl=(value:real, max_range:real=real(null), mode:string='gauge+number', chart_title:string='',font_color:string='black',
                    bar_color:string='green', bar_bg_color:string='lightgreen', tick_color:string='darkblue', tick_width:int=1)
{
    let gauge_chart = toscalar(PlotlyTemplate | where name == "gauge" | project plotly);
    print plotly = gauge_chart
    | extend plotly=replace_string(plotly, '$VALUE$', tostring(value))
    | extend plotly=replace_string(plotly, '$MAX_RANGE$', iff(isnull(max_range), 'null', tostring(max_range)))
    | extend plotly=replace_string(plotly, '$MODE$', mode)
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | extend plotly=replace_string(plotly, '$FONT_COLOR$', font_color)
    | extend plotly=replace_string(plotly, '$BAR_COLOR$', bar_color)
    | extend plotly=replace_string(plotly, '$BAR_BG_COLOR$', bar_bg_color)
    | extend plotly=replace_string(plotly, '$TICK_COLOR$', tick_color)
    | extend plotly=replace_string(plotly, '$TICK_WIDTH$', tostring(tick_width))
    | project plotly
};
plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly

Stored

plotly_gauge_fl(value=180, chart_title='Speed', font_color='purple', tick_width=5)
| render plotly

Output

The output is a Plotly JSON string that can be rendered in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.

Screenshot of gauge chart with random data.
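
The remaining parameters can be combined in the same way. For example, the following illustrative invocation (assuming the stored function is already created; the parameter values here are arbitrary) also sets the gauge range and bar colors:

plotly_gauge_fl(value=120, max_range=200, chart_title='Utilization', bar_color='orange', bar_bg_color='lightgray', tick_color='black', tick_width=2)
| render plotly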

29 - plotly_scatter3d_fl()

Learn how to use the plotly_scatter3d_fl() user-defined function.

The function plotly_scatter3d_fl() is a user-defined function (UDF) that allows you to customize a plotly template to create an interactive 3D scatter chart.

The function accepts a table containing the records to be rendered, the names of the x, y, z, and aggregation columns, and the chart title string. The function returns a single cell table containing plotly JSON. Optionally, you can render the data in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information, see Plotly (preview).

Prerequisite

Extract the required ‘scatter3d’ template from the publicly available PlotlyTemplate table. Copy this table from the Samples database to your database by running the following KQL command from your target database:

.set PlotlyTemplate <| cluster('help.kusto.windows.net').database('Samples').PlotlyTemplate

Syntax

T | invoke plotly_scatter3d_fl(x_col, y_col, z_col, aggr_col [, chart_title ])

Parameters

NameTypeRequiredDescription
x_colstring✔️The name of the column for the X coordinates of the 3D plot.
y_colstring✔️The name of the column for the Y coordinates of the 3D plot.
z_colstring✔️The name of the column for the Z coordinates of the 3D plot.
aggr_colstring✔️The name of the grouping column. Records in the same group are rendered in a distinct color.
chart_titlestringThe chart title. The default is ‘3D Scatter chart’.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
    let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
    let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
    tbl_ex
    | serialize 
    | summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
    | summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
    | extend plotly = scatter3d_chart
    | extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
    | extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
    | extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
    | extend plotly=replace_string(plotly, '$X_NAME$', x_col)
    | extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
    | extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
    | extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
    | extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
    | extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | project plotly
};
// Write your query to use your function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Plotly", docstring = "Render 3D scatter chart using plotly template")
plotly_scatter3d_fl(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
    let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
    let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
    tbl_ex
    | serialize 
    | summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
    | summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
    | extend plotly = scatter3d_chart
    | extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
    | extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
    | extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
    | extend plotly=replace_string(plotly, '$X_NAME$', x_col)
    | extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
    | extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
    | extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
    | extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
    | extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | project plotly
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let plotly_scatter3d_fl=(tbl:(*), x_col:string, y_col:string, z_col:string, aggr_col:string='', chart_title:string='3D Scatter chart')
{
    let scatter3d_chart = toscalar(PlotlyTemplate | where name == "scatter3d" | project plotly);
    let tbl_ex = tbl | extend _x = column_ifexists(x_col, 0.0), _y = column_ifexists(y_col, 0.0), _z = column_ifexists(z_col, 0.0), _aggr = column_ifexists(aggr_col, 'ALL');
    tbl_ex
    | serialize 
    | summarize _x=pack_array(make_list(_x)), _y=pack_array(make_list(_y)), _z=pack_array(make_list(_z)) by _aggr
    | summarize _aggr=make_list(_aggr), _x=make_list(_x), _y=make_list(_y), _z=make_list(_z)
    | extend plotly = scatter3d_chart
    | extend plotly=replace_string(plotly, '$CLASS1$', tostring(_aggr[0]))
    | extend plotly=replace_string(plotly, '$CLASS2$', tostring(_aggr[1]))
    | extend plotly=replace_string(plotly, '$CLASS3$', tostring(_aggr[2]))
    | extend plotly=replace_string(plotly, '$X_NAME$', x_col)
    | extend plotly=replace_string(plotly, '$Y_NAME$', y_col)
    | extend plotly=replace_string(plotly, '$Z_NAME$', z_col)
    | extend plotly=replace_string(plotly, '$CLASS1_X$', tostring(_x[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Y$', tostring(_y[0]))
    | extend plotly=replace_string(plotly, '$CLASS1_Z$', tostring(_z[0]))
    | extend plotly=replace_string(plotly, '$CLASS2_X$', tostring(_x[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Y$', tostring(_y[1]))
    | extend plotly=replace_string(plotly, '$CLASS2_Z$', tostring(_z[1]))
    | extend plotly=replace_string(plotly, '$CLASS3_X$', tostring(_x[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Y$', tostring(_y[2]))
    | extend plotly=replace_string(plotly, '$CLASS3_Z$', tostring(_z[2]))
    | extend plotly=replace_string(plotly, '$TITLE$', chart_title)
    | project plotly
};
Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
| render plotly

Stored

Iris
| invoke plotly_scatter3d_fl(x_col='SepalLength', y_col='PetalLength', z_col='SepalWidth', aggr_col='Class', chart_title='3D scatter chart using plotly_scatter3d_fl()')
| render plotly

Output

The output is a Plotly JSON string that can be rendered in an Azure Data Explorer dashboard tile or a Real-Time dashboard tile. For more information on creating dashboard tiles, see Visualize data with Azure Data Explorer dashboards and Real-Time dashboards.

Screenshot of 3D scatter chart of a sample dataset.

You can rotate, zoom and hover over specific records:

Screenshot of rotated 3D scatter chart of a sample dataset.

30 - predict_fl()

This article describes the predict_fl() user-defined function.

The function predict_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model was built using Scikit-learn, serialized to string, and saved in a standard table.

Syntax

T | invoke predict_fl(models_tbl, model_name, features_cols, pred_col)

Parameters

NameTypeRequiredDescription
models_tblstring✔️The name of the table that contains all serialized models. The table must have the following columns:
name: the model name
timestamp: time of model training
model: string representation of the serialized model
model_namestring✔️The name of the specific model to use.
features_colsdynamic✔️An array containing the names of the features columns that are used by the model for prediction.
pred_colstring✔️The name of the column that stores the predictions.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
        
        import pickle
        import binascii
        
        smodel = kargs["smodel"]
        features_cols = kargs["features_cols"]
        pred_col = kargs["pred_col"]
        bmodel = binascii.unhexlify(smodel)
        clf1 = pickle.loads(bmodel)
        df1 = df[features_cols]
        predictions = clf1.predict(df1)
        
        result = df
        result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
        
    ```;
    samples
    | evaluate python(typeof(*), code, kwargs)
};
// Write your code to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create function with (folder = "Packages\\ML", docstring = "Predict using ML model, build by Scikit-learn")
predict_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
        
        import pickle
        import binascii
        
        smodel = kargs["smodel"]
        features_cols = kargs["features_cols"]
        pred_col = kargs["pred_col"]
        bmodel = binascii.unhexlify(smodel)
        clf1 = pickle.loads(bmodel)
        df1 = df[features_cols]
        predictions = clf1.predict(df1)
        
        result = df
        result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
        
    ```;
    samples
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let predict_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
        
        import pickle
        import binascii
        
        smodel = kargs["smodel"]
        features_cols = kargs["features_cols"]
        pred_col = kargs["pred_col"]
        bmodel = binascii.unhexlify(smodel)
        clf1 = pickle.loads(bmodel)
        df1 = df[features_cols]
        predictions = clf1.predict(df1)
        
        result = df
        result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
        
    ```;
    samples
    | evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection 
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy

Stored

//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection 
| where Test == 1
| extend pred_Occupancy=false
| invoke predict_fl(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy

Output

Occupancypred_Occupancyn
TRUETRUE3006
FALSETRUE112
TRUEFALSE15
FALSEFALSE9284

Model asset

To run the example, get the sample dataset and the pre-trained model; the Python plugin must be enabled.

//dataset
.set OccupancyDetection <| cluster('help').database('Samples').OccupancyDetection

//model
.set ML_Models <| datatable(name:string, timestamp:datetime, model:string) [
'Occupancy', datetime(now), '800363736b6c6561726e2e6c696e6561725f6d6f64656c2e6c6f6769737469630a4c6f67697374696352656772657373696f6e0a7100298171017d710228580700000070656e616c7479710358020000006c32710458040000006475616c7105895803000000746f6c7106473f1a36e2eb1c432d5801000000437107473ff0000000000000580d0000006669745f696e746572636570747108885811000000696e746572636570745f7363616c696e6771094b01580c000000636c6173735f776569676874710a4e580c00000072616e646f6d5f7374617465710b4e5806000000736f6c766572710c58090000006c69626c696e656172710d58080000006d61785f69746572710e4b64580b0000006d756c74695f636c617373710f58030000006f767271105807000000766572626f736571114b00580a0000007761726d5f737461727471128958060000006e5f6a6f627371134b015808000000636c61737365735f7114636e756d70792e636f72652e6d756c746961727261790a5f7265636f6e7374727563740a7115636e756d70790a6e6461727261790a71164b00857117430162711887711952711a284b014b0285711b636e756d70790a64747970650a711c58020000006231711d4b004b0187711e52711f284b0358010000007c71204e4e4e4affffffff4affffffff4b007471216289430200017122747123625805000000636f65665f7124681568164b008571256818877126527127284b014b014b05867128681c5802000000663871294b004b0187712a52712b284b0358010000003c712c4e4e4e4affffffff4affffffff4b0074712d628943286a02e0d50687e0bfc6d7c974fa93a63fb3d3b8080e6e943ffceb15defdad713f14c3a76bd73202bf712e74712f62580a000000696e746572636570745f7130681568164b008571316818877132527133284b014b01857134682b894308f1e89f57711290bf71357471366258070000006e5f697465725f7137681568164b00857138681887713952713a284b014b0185713b681c58020000006934713c4b004b0187713d52713e284b03682c4e4e4e4affffffff4affffffff4b0074713f628943040c00000071407471416258100000005f736b6c6561726e5f76657273696f6e71425806000000302e31392e32714375622e'
]
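
For reference, a model of this kind can also be trained and serialized inside Kusto. The following query is only an illustrative sketch: it assumes the Python plugin is enabled and trains a simple scikit-learn logistic regression on the training portion of the dataset (Test == 0), then hex-encodes the pickled model in the format predict_fl() expects. It isn’t necessarily the exact pipeline that produced the sample model above, and the resulting row still needs to be ingested into ML_Models (for example, with a .set-or-append command):

let code = ```if 1:
    import pickle
    import binascii
    from sklearn.linear_model import LogisticRegression
    # Train a simple classifier on the sensor measurements
    features = ["Temperature", "Humidity", "Light", "CO2", "HumidityRatio"]
    clf = LogisticRegression(solver="liblinear")
    clf.fit(df[features], df["Occupancy"])
    # Serialize the model to a hex string, the format predict_fl() expects
    smodel = binascii.hexlify(pickle.dumps(clf)).decode()
    result = pd.DataFrame({"model": [smodel]})
```;
OccupancyDetection
| where Test == 0
| evaluate python(typeof(model:string), code)
| extend name='Occupancy', timestamp=now()
| project name, timestamp, model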

31 - predict_onnx_fl()

This article describes the predict_onnx_fl() user-defined function.

The function predict_onnx_fl() is a user-defined function (UDF) that predicts using an existing trained machine learning model. This model has been converted to ONNX format, serialized to string, and saved in a standard table.

Syntax

T | invoke predict_onnx_fl(models_tbl, model_name, features_cols, pred_col)

Parameters

NameTypeRequiredDescription
models_tblstring✔️The name of the table that contains all serialized models. The table must have the following columns:
name: the model name
timestamp: time of model training
model: string representation of the serialized model
model_namestring✔️The name of the specific model to use.
features_colsdynamic✔️An array containing the names of the features columns that are used by the model for prediction.
pred_colstring✔️The name of the column that stores the predictions.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
    
    import binascii
    
    smodel = kargs["smodel"]
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    bmodel = binascii.unhexlify(smodel)
    
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    
    import onnxruntime as rt
    sess = rt.InferenceSession(bmodel)
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    df1 = df[features_cols]
    predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
    
    result = df
    result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
    
    ```;
    samples | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\ML", docstring = "Predict using ONNX model")
predict_onnx_fl(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
    
    import binascii
    
    smodel = kargs["smodel"]
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    bmodel = binascii.unhexlify(smodel)
    
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    
    import onnxruntime as rt
    sess = rt.InferenceSession(bmodel)
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    df1 = df[features_cols]
    predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
    
    result = df
    result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
    
    ```;
    samples | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let predict_onnx_fl=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    let model_str = toscalar(models_tbl | where name == model_name | top 1 by timestamp desc | project model);
    let kwargs = bag_pack('smodel', model_str, 'features_cols', features_cols, 'pred_col', pred_col);
    let code = ```if 1:
    
    import binascii
    
    smodel = kargs["smodel"]
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    bmodel = binascii.unhexlify(smodel)
    
    features_cols = kargs["features_cols"]
    pred_col = kargs["pred_col"]
    
    import onnxruntime as rt
    sess = rt.InferenceSession(bmodel)
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    df1 = df[features_cols]
    predictions = sess.run([label_name], {input_name: df1.values.astype(np.float32)})[0]
    
    result = df
    result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])
    
    ```;
    samples | evaluate python(typeof(*), code, kwargs)
};
//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection 
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy

Stored

//
// Predicts room occupancy from sensors measurements, and calculates the confusion matrix
//
// Occupancy Detection is an open dataset from UCI Repository at https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+
// It contains experimental data for binary classification of room occupancy from Temperature,Humidity,Light and CO2.
// Ground-truth labels were obtained from time stamped pictures that were taken every minute
//
OccupancyDetection 
| where Test == 1
| extend pred_Occupancy=bool(0)
| invoke predict_onnx_fl(ML_Models, 'ONNX-Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| summarize n=count() by Occupancy, pred_Occupancy

Output

Occupancypred_Occupancyn
TRUETRUE3006
FALSETRUE112
TRUEFALSE15
FALSEFALSE9284

32 - quantize_fl()

This article describes the quantize_fl() user-defined function.

The function quantize_fl() is a user-defined function (UDF) that bins metric columns, quantizing them to categorical labels based on the K-Means algorithm.

Syntax

T | invoke quantize_fl(num_bins, in_cols, out_cols [, labels ])

Parameters

NameTypeRequiredDescription
num_binsint✔️The required number of bins.
in_colsdynamic✔️An array containing the names of the columns to quantize.
out_colsdynamic✔️An array containing the names of the respective output columns for the binned values.
labelsdynamicAn array containing the label names. If unspecified, bin ranges will be used.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
    let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
    let code = ```if 1:
        
        from sklearn.preprocessing import KBinsDiscretizer
        
        num_bins = kargs["num_bins"]
        in_cols = kargs["in_cols"]
        out_cols = kargs["out_cols"]
        labels = kargs["labels"]
        
        result = df
        binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
        df_in = df[in_cols]
        bdata = binner.fit_transform(df_in)
        if labels is None:
            for i in range(len(out_cols)):    # loop on each column and convert it to binned labels
                ii = np.round(binner.bin_edges_[i], 3)
                labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
                result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
        else:
            result[out_cols] = np.take(labels, bdata.astype(int))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create function with (folder = "Packages\\ML", docstring = "Binning metric columns")
quantize_fl(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic)
{
    let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
    let code = ```if 1:
        
        from sklearn.preprocessing import KBinsDiscretizer
        
        num_bins = kargs["num_bins"]
        in_cols = kargs["in_cols"]
        out_cols = kargs["out_cols"]
        labels = kargs["labels"]
        
        result = df
        binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
        df_in = df[in_cols]
        bdata = binner.fit_transform(df_in)
        if labels is None:
            for i in range(len(out_cols)):    # loop on each column and convert it to binned labels
                ii = np.round(binner.bin_edges_[i], 3)
                labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
                result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
        else:
            result[out_cols] = np.take(labels, bdata.astype(int))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let quantize_fl=(tbl:(*), num_bins:int, in_cols:dynamic, out_cols:dynamic, labels:dynamic=dynamic(null))
{
    let kwargs = bag_pack('num_bins', num_bins, 'in_cols', in_cols, 'out_cols', out_cols, 'labels', labels);
    let code = ```if 1:
        
        from sklearn.preprocessing import KBinsDiscretizer
        
        num_bins = kargs["num_bins"]
        in_cols = kargs["in_cols"]
        out_cols = kargs["out_cols"]
        labels = kargs["labels"]
        
        result = df
        binner = KBinsDiscretizer(n_bins=num_bins, encode="ordinal", strategy="kmeans")
        df_in = df[in_cols]
        bdata = binner.fit_transform(df_in)
        if labels is None:
            for i in range(len(out_cols)):    # loop on each column and convert it to binned labels
                ii = np.round(binner.bin_edges_[i], 3)
                labels = [str(ii[j-1]) + '-' + str(ii[j]) for j in range(1, num_bins+1)]
                result.loc[:,out_cols[i]] = np.take(labels, bdata[:, i].astype(int))
        else:
            result[out_cols] = np.take(labels, bdata.astype(int))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
//
union 
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))

Stored

union 
(range x from 1 to 5 step 1),
(range x from 10 to 15 step 1),
(range x from 20 to 25 step 1)
| extend x_label='', x_bin=''
| invoke quantize_fl(3, pack_array('x'), pack_array('x_label'), pack_array('Low', 'Med', 'High'))
| invoke quantize_fl(3, pack_array('x'), pack_array('x_bin'), dynamic(null))

Output

x     x_label    x_bin
1     Low        1.0-7.75
2     Low        1.0-7.75
3     Low        1.0-7.75
4     Low        1.0-7.75
5     Low        1.0-7.75
20    High       17.5-25.0
21    High       17.5-25.0
22    High       17.5-25.0
23    High       17.5-25.0
24    High       17.5-25.0
25    High       17.5-25.0
10    Med        7.75-17.5
11    Med        7.75-17.5
12    Med        7.75-17.5
13    Med        7.75-17.5
14    Med        7.75-17.5
15    Med        7.75-17.5

33 - series_clean_anomalies_fl()

Learn how to use the series_clean_anomalies_fl() function to clean anomalous points in a series.

Cleans anomalous points in a series.

The function series_clean_anomalies_fl() is a user-defined function (UDF) that takes a dynamic numerical array and a matching array of anomalies as input, and replaces the anomalous points in the input array with values interpolated from their adjacent points.
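
The same two-step idea, masking the anomalies as nulls and then interpolating linearly, can be sketched outside of Kusto with pandas. This is only an illustration of the logic, not part of the function.

import numpy as np
import pandas as pd

y         = np.array([10.0, 12.0, 200.0, 14.0, 16.0])   # input series with one spike
anomalies = np.array([0,    0,    1,     0,    0])       # nonzero marks an anomalous point

cleaned = pd.Series(np.where(anomalies != 0, np.nan, y)).interpolate(method="linear")
print(cleaned.tolist())  # [10.0, 12.0, 13.0, 14.0, 16.0], the spike is replaced by interpolation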

Syntax

series_clean_anomalies_fl(y_series, anomalies)

Parameters

NameTypeRequiredDescription
y_seriesdynamic✔️The input array of numeric values.
anomaliesdynamic✔️The anomalies array containing either 0 for normal points or any other value for anomalous points.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
    let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series);  //  replace anomalies with null values
    series_fill_linear(fnum)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Replace anomalies by interpolated value", skipvalidation = "true")
series_clean_anomalies_fl(y_series:dynamic, anomalies:dynamic)
{
    let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series);  //  replace anomalies with null values
    series_fill_linear(fnum)
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_clean_anomalies_fl = (y_series:dynamic, anomalies:dynamic)
{
    let fnum = array_iff(series_not_equals(anomalies, 0), real(null), y_series);  //  replace anomalies with null values
    series_fill_linear(fnum)
}
;
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)

Stored

let min_t = datetime(2016-08-29);
let max_t = datetime(2016-08-31);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 20m by OsVer
| extend anomalies = series_decompose_anomalies(num, 0.8)
| extend num_c = series_clean_anomalies_fl(num, anomalies)
| render anomalychart with (anomalycolumns=anomalies)

Output

Graph of a time series with anomalies before and after cleaning.

34 - series_cosine_similarity_fl()

This article describes the series_cosine_similarity_fl() user-defined function.

Calculates the cosine similarity of two numerical vectors.

The function series_cosine_similarity_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their cosine similarity.
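
For reference, a minimal NumPy sketch of the same computation, the dot product divided by the product of the vector norms, reproducing the 45-degree example shown further below; it's an illustration only.

import numpy as np

v1 = np.array([0.0, 1.0])
v2 = np.array([np.sqrt(2), np.sqrt(2)])

cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
angle   = np.degrees(np.arccos(cos_sim))
print(round(angle))  # 45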

Syntax

series_cosine_similarity_fl(vec1, vec2, [ vec1_size [, vec2_size ]])

Parameters

NameTypeRequiredDescription
vec1dynamic✔️An array of numeric values.
vec2dynamic✔️An array of numeric values that is the same length as vec1.
vec1_sizerealThe size of vec1. This is equivalent to the square root of the dot product of the vector with itself.
vec2_sizerealThe size of vec2.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
    let dp = series_dot_product(vec1, vec2);
    let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
    let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
    dp/(v1l*v2l)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the Cosine similarity of 2 numerical arrays")
series_cosine_similarity_fl(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
    let dp = series_dot_product(vec1, vec2);
    let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
    let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
    dp/(v1l*v2l)
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_cosine_similarity_fl=(vec1:dynamic, vec2:dynamic, vec1_size:real=double(null), vec2_size:real=double(null))
{
    let dp = series_dot_product(vec1, vec2);
    let v1l = iff(isnull(vec1_size), sqrt(series_dot_product(vec1, vec1)), vec1_size);
    let v2l = iff(isnull(vec2_size), sqrt(series_dot_product(vec2, vec2)), vec2_size);
    dp/(v1l*v2l)
};
let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360

Stored

let s1=pack_array(0, 1);
let s2=pack_array(sqrt(2), sqrt(2));
print angle=acos(series_cosine_similarity_fl(s1, s2))/(2*pi())*360

Output

angle
45

35 - series_dbl_exp_smoothing_fl()

This article describes the series_dbl_exp_smoothing_fl() user-defined function.

Applies a double exponential smoothing filter on a series.

The function series_dbl_exp_smoothing_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a double exponential smoothing filter. When there is a trend in the series, this function is superior to series_exp_smoothing_fl(), which implements a basic exponential smoothing filter.
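
For readers who prefer the textbook form, the following Python sketch implements the classic double exponential (Holt) smoothing recursion with explicit level and trend terms. The KQL definition below expresses a comparable filter compactly via series_iir(); this sketch is for reference only and isn't claimed to be bit-for-bit identical to it.

def double_exp_smoothing(y, alpha=0.5, beta=0.5):
    # Holt recursion: level 's' tracks the value, trend 'b' tracks the slope.
    s, b = y[0], y[1] - y[0]
    out = [s]
    for v in y[1:]:
        s_prev = s
        s = alpha * v + (1 - alpha) * (s_prev + b)   # smoothed level
        b = beta * (s - s_prev) + (1 - beta) * b     # smoothed trend
        out.append(s)
    return out

print(double_exp_smoothing([1, 2, 4, 7, 11], alpha=0.2, beta=0.4))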

Syntax

series_dbl_exp_smoothing_fl(y_series [, alpha [, beta ]])

Parameters

NameTypeRequiredDescription
y_seriesdynamic✔️An array of numeric values.
alpharealA value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha. The default is 0.5.
betarealA value in the range [0-1] that specifies the weight of the last slope vs. the weight of the previous slopes, which is 1 - beta. The default is 0.5.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
    series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Double exponential smoothing for a series")
series_dbl_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
    series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_dbl_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5, beta:double=0.5)
{
    series_iir(y_series, pack_array(alpha, alpha*(beta-1)), pack_array(1, alpha*(1+beta)-2, 1-alpha))
};
range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4) 
| render linechart

Stored

range x from 1 to 50 step 1
| extend y = x + rand()*10
| summarize x = make_list(x), y = make_list(y)
| extend dbl_exp_smooth_y = series_dbl_exp_smoothing_fl(y, 0.2, 0.4) 
| render linechart

Output

Graph showing double exponential smoothing of artificial series.

36 - series_dot_product_fl()

This article describes the series_dot_product_fl() user-defined function.

Calculates the dot product of two numerical vectors.

The function series_dot_product_fl() is a user-defined function (UDF) that takes an expression containing two dynamic numerical arrays as input and calculates their dot product.
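
The implementation below multiplies the two arrays element-wise and then uses series_iir() as a running (cumulative) sum, taking the last element as the dot product. The equivalent NumPy sketch, shown here only for illustration:

import numpy as np

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([4.0, 5.0, 6.0])

elem_prod = v1 * v2                 # element-wise product, as series_multiply() does
cum_sum   = np.cumsum(elem_prod)    # running sum, as the [1]/[1,-1] IIR filter does
print(cum_sum[-1], np.dot(v1, v2))  # 32.0 32.0, the last cumulative value is the dot product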

Syntax

series_dot_product_fl(vec1, vec2)

Parameters

NameTypeRequiredDescription
vec1dynamic✔️An array of numeric values.
vec2dynamic✔️An array of numeric values that is the same length as vec1.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
    let elem_prod = series_multiply(vec1, vec2);
    let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
    todouble(cum_sum[-1])
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate the dot product of 2 numerical arrays")
series_dot_product_fl(vec1:dynamic, vec2:dynamic)
{
    let elem_prod = series_multiply(vec1, vec2);
    let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
    todouble(cum_sum[-1])
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_dot_product_fl=(vec1:dynamic, vec2:dynamic)
{
    let elem_prod = series_multiply(vec1, vec2);
    let cum_sum = series_iir(elem_prod, dynamic([1]), dynamic([1,-1]));
    todouble(cum_sum[-1])
};
union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)

Stored

union
(print 1 | project v1=range(1, 3, 1), v2=range(4, 6, 1)),
(print 1 | project v1=range(11, 13, 1), v2=range(14, 16, 1))
| extend v3=series_dot_product_fl(v1, v2)

Output

Table showing the result of dot product of 2 vectors using user-defined function series_dot_product_fl.

37 - series_downsample_fl()

This article describes the series_downsample_fl() user-defined function.

The function series_downsample_fl() is a user-defined function (UDF) that downsamples a time series by an integer factor. This function takes a table containing multiple time series (dynamic numerical arrays) and downsamples each series. The output contains both the coarser series and its respective times array. To avoid aliasing, the function applies a simple low pass filter on each series before subsampling.
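
Conceptually the function does two things per series: smooth it with a short moving-average (low pass) filter, then keep one point per sampling_factor points. A minimal NumPy sketch of that idea follows; it's for illustration only, and boundary handling and the exact sample kept per bin may differ from the series_fir()-based KQL implementation below.

import numpy as np

def downsample(y, factor):
    # Simple low pass filter: moving average of length 'factor' (a normalized FIR of ones).
    kernel   = np.ones(factor) / factor
    smoothed = np.convolve(y, kernel, mode="same")
    # Subsample: keep every 'factor'-th point of the smoothed series.
    return smoothed[::factor]

y = np.arange(24, dtype=float)
print(downsample(y, 4))  # 6 coarse points instead of 24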

Syntax

T | invoke series_downsample_fl(t_col, y_col, ds_t_col, ds_y_col, sampling_factor)

Parameters

NameTypeRequiredDescription
t_colstring✔️The name of the column that contains the time axis of the series to downsample.
y_colstring✔️The name of the column that contains the series to downsample.
ds_t_colstring✔️The name of the column to store the downsampled time axis of each series.
ds_y_colstring✔️The name of the column to store the downsampled series.
sampling_factorint✔️An integer specifying the required downsampling factor.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
    tbl
    | extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
    | extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true)    //  apply a simple low pass filter before sub-sampling
    | mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
    (extend rid=row_number()-1
    | where rid % sampling_factor == ceiling(sampling_factor/2.0)-1                    //  sub-sampling
    | summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
    | extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
    | project-away _t_, _y_
    | evaluate bag_unpack(cols)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Downsampling a series by an integer factor")
series_downsample_fl(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
    tbl
    | extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
    | extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true)    //  apply a simple low pass filter before sub-sampling
    | mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
    (extend rid=row_number()-1
    | where rid % sampling_factor == ceiling(sampling_factor/2.0)-1                    //  sub-sampling
    | summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
    | extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
    | project-away _t_, _y_
    | evaluate bag_unpack(cols)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_downsample_fl=(tbl:(*), t_col:string, y_col:string, ds_t_col:string, ds_y_col:string, sampling_factor:int)
{
    tbl
    | extend _t_ = column_ifexists(t_col, dynamic(0)), _y_ = column_ifexists(y_col, dynamic(0))
    | extend _y_ = series_fir(_y_, repeat(1, sampling_factor), true, true)    //  apply a simple low pass filter before sub-sampling
    | mv-apply _t_ to typeof(DateTime), _y_ to typeof(double) on
    (extend rid=row_number()-1
    | where rid % sampling_factor == ceiling(sampling_factor/2.0)-1                    //  sub-sampling
    | summarize _t_ = make_list(_t_), _y_ = make_list(_y_))
    | extend cols = bag_pack(ds_t_col, _t_, ds_y_col, _y_)
    | project-away _t_, _y_
    | evaluate bag_unpack(cols)
};
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)

Stored

demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| invoke series_downsample_fl('TimeStamp', 'num', 'coarse_TimeStamp', 'coarse_num', 4)
| render timechart with(xcolumn=coarse_TimeStamp, ycolumns=coarse_num)

Output

The time series downsampled by 4: Graph showing downsampling of a time series.

For reference, here is the original time series (before downsampling):

demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| render timechart with(xcolumn=TimeStamp, ycolumns=num)

Graph showing the original time series, before downsampling

38 - series_exp_smoothing_fl()

This article describes the series_exp_smoothing_fl() user-defined function.

Applies a basic exponential smoothing filter on a series.

The function series_exp_smoothing_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a basic exponential smoothing filter.
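
The filter is the standard exponentially weighted moving average. With series_iir(y, [alpha], [1, alpha-1]), as in the definition below, the recursion is s[n] = alpha*y[n] + (1-alpha)*s[n-1]. The following Python sketch spells that recursion out for illustration; initialization details may differ slightly from series_iir().

def exp_smoothing(y, alpha=0.5):
    # s[n] = alpha*y[n] + (1-alpha)*s[n-1]; here s starts at 0 (zero initial state).
    s, out = 0.0, []
    for v in y:
        s = alpha * v + (1 - alpha) * s
        out.append(s)
    return out

print(exp_smoothing([0, 10, 0, 10, 0], alpha=0.4))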

Syntax

series_exp_smoothing_fl(y_series [, alpha ])

Parameters

NameTypeRequiredDescription
y_seriesdynamic✔️An array cell of numeric values.
alpharealA value in the range [0-1] that specifies the weight of the last point vs. the weight of the previous points, which is 1 - alpha. The default is 0.5.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
    series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Basic exponential smoothing for a series")
series_exp_smoothing_fl(y_series:dynamic, alpha:double=0.5)
{
    series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_exp_smoothing_fl = (y_series:dynamic, alpha:double=0.5)
{
    series_iir(y_series, pack_array(alpha), pack_array(1, alpha-1))
};
range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4) 
| render linechart

Stored

range x from 1 to 50 step 1
| extend y = x % 10
| summarize x = make_list(x), y = make_list(y)
| extend exp_smooth_y = series_exp_smoothing_fl(y, 0.4) 
| render linechart

Output

Graph showing exponential smoothing of artificial series.

39 - series_fbprophet_forecast_fl()

This article describes the series_fbprophet_forecast_fl() user-defined function.

The function series_fbprophet_forecast_fl() is a user-defined function (UDF) that takes an expression containing a time series as input, and predicts the values of the last trailing points using the Prophet algorithm. The function returns both the forecasted points and their confidence intervals. This function is a Kusto Query Language (KQL) wrapper to the Prophet() class, and exposes only the parameters that are mandatory for prediction. Feel free to modify your copy to support more parameters, such as holidays, change points, Fourier order, and so on.
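
For orientation, this is the core Prophet workflow that the function wraps for each series, sketched as standalone Python. It assumes the prophet package is installed locally and uses a made-up DataFrame, so treat it as an illustration of the workflow rather than the function itself.

import pandas as pd
from prophet import Prophet  # assumes the 'prophet' package is installed

# Prophet expects a frame with columns 'ds' (timestamps) and 'y' (values).
history = pd.DataFrame({
    "ds": pd.date_range("2017-01-01", periods=48, freq="2h"),
    "y":  [i % 12 + 1 for i in range(48)],
})

m = Prophet()
m.fit(history)                                    # learn on the historical points
future   = m.make_future_dataframe(periods=12, freq="2h")
forecast = m.predict(future)                      # yhat, yhat_lower, yhat_upper per point
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())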

Syntax

T | invoke series_fbprophet_forecast_fl(ts_series, y_series, y_pred_series, [ points ], [ y_pred_low_series ], [ y_pred_high_series ])

Parameters

NameTypeRequiredDescription
ts_seriesstring✔️The name of the input table column containing the time stamps of the series to predict.
y_seriesstring✔️The name of the input table column containing the values of the series to predict.
y_pred_seriesstring✔️The name of the column to store the predicted series.
pointsint✔️The number of points at the end of the series to predict (forecast). These points are excluded from the learning (regression) process. The default is 0.
y_pred_low_seriesstringThe name of the column to store the series of the lowest values of the confidence interval. Omit if the confidence interval isn’t needed.
y_pred_high_seriesstringThe name of the column to store the series of the highest values of the confidence interval. Omit if the confidence interval isn’t needed.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
    let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
    let code = ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install("prophet.zip")
        ts_series = kargs["ts_series"]
        y_series = kargs["y_series"]
        y_pred_series = kargs["y_pred_series"]
        points = kargs["points"]
        y_pred_low_series = kargs["y_pred_low_series"]
        y_pred_high_series = kargs["y_pred_high_series"]
        result = df
        sr = pd.Series(df[y_pred_series])
        if y_pred_low_series != '':
            srl = pd.Series(df[y_pred_low_series])
        if y_pred_high_series != '':
            srh = pd.Series(df[y_pred_high_series])
        from prophet import Prophet
        df1 = pd.DataFrame(columns=["ds", "y"])
        for i in range(df.shape[0]):
            df1["ds"] = pd.to_datetime(df[ts_series][i])
            df1["ds"] = df1["ds"].dt.tz_convert(None)
            df1["y"] = df[y_series][i]
            df2 = df1[:-points]
            m = Prophet()
            m.fit(df2)
            future = df1[["ds"]]
            forecast = m.predict(future)
            sr[i] = list(forecast["yhat"])
            if y_pred_low_series != '':
                srl[i] = list(forecast["yhat_lower"])
            if y_pred_high_series != '':
                srh[i] = list(forecast["yhat_upper"])
        result[y_pred_series] = sr
        if y_pred_low_series != '':
            result[y_pred_low_series] = srl
        if y_pred_high_series != '':
            result[y_pred_high_series] = srh
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Forecast using Facebook fbprophet package")
series_fbprophet_forecast_fl(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
    let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
    let code = ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install("prophet.zip")
        ts_series = kargs["ts_series"]
        y_series = kargs["y_series"]
        y_pred_series = kargs["y_pred_series"]
        points = kargs["points"]
        y_pred_low_series = kargs["y_pred_low_series"]
        y_pred_high_series = kargs["y_pred_high_series"]
        result = df
        sr = pd.Series(df[y_pred_series])
        if y_pred_low_series != '':
            srl = pd.Series(df[y_pred_low_series])
        if y_pred_high_series != '':
            srh = pd.Series(df[y_pred_high_series])
        from prophet import Prophet
        df1 = pd.DataFrame(columns=["ds", "y"])
        for i in range(df.shape[0]):
            df1["ds"] = pd.to_datetime(df[ts_series][i])
            df1["ds"] = df1["ds"].dt.tz_convert(None)
            df1["y"] = df[y_series][i]
            df2 = df1[:-points]
            m = Prophet()
            m.fit(df2)
            future = df1[["ds"]]
            forecast = m.predict(future)
            sr[i] = list(forecast["yhat"])
            if y_pred_low_series != '':
                srl[i] = list(forecast["yhat_lower"])
            if y_pred_high_series != '':
                srh[i] = list(forecast["yhat_upper"])
        result[y_pred_series] = sr
        if y_pred_low_series != '':
            result[y_pred_low_series] = srl
        if y_pred_high_series != '':
            result[y_pred_high_series] = srh
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fbprophet_forecast_fl=(tbl:(*), ts_series:string, y_series:string, y_pred_series:string, points:int=0, y_pred_low_series:string='', y_pred_high_series:string='')
{
    let kwargs = bag_pack('ts_series', ts_series, 'y_series', y_series, 'y_pred_series', y_pred_series, 'points', points, 'y_pred_low_series', y_pred_low_series, 'y_pred_high_series', y_pred_high_series);
    let code = ```if 1:
        from sandbox_utils import Zipackage
        Zipackage.install("prophet.zip")
        ts_series = kargs["ts_series"]
        y_series = kargs["y_series"]
        y_pred_series = kargs["y_pred_series"]
        points = kargs["points"]
        y_pred_low_series = kargs["y_pred_low_series"]
        y_pred_high_series = kargs["y_pred_high_series"]
        result = df
        sr = pd.Series(df[y_pred_series])
        if y_pred_low_series != '':
            srl = pd.Series(df[y_pred_low_series])
        if y_pred_high_series != '':
            srh = pd.Series(df[y_pred_high_series])
        from prophet import Prophet
        df1 = pd.DataFrame(columns=["ds", "y"])
        for i in range(df.shape[0]):
            df1["ds"] = pd.to_datetime(df[ts_series][i])
            df1["ds"] = df1["ds"].dt.tz_convert(None)
            df1["y"] = df[y_series][i]
            df2 = df1[:-points]
            m = Prophet()
            m.fit(df2)
            future = df1[["ds"]]
            forecast = m.predict(future)
            sr[i] = list(forecast["yhat"])
            if y_pred_low_series != '':
                srl[i] = list(forecast["yhat_lower"])
            if y_pred_high_series != '':
                srh[i] = list(forecast["yhat_upper"])
        result[y_pred_series] = sr
        if y_pred_low_series != '':
            result[y_pred_low_series] = srl
        if y_pred_high_series != '':
            result[y_pred_high_series] = srh
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs
, external_artifacts=bag_pack('prophet.zip', 'https://artifactswestusnew.blob.core.windows.net/public/prophet-1.1.5.zip?*** YOUR SAS TOKEN ***'))
};
//
//  Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid 
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart 

Stored

//
//  Forecasting 3 time series using fbprophet, compare to forecasting using the native function series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid 
| extend pred_num_native = series_decompose_forecast(num, toint(horizon/dt))
| extend pred_num=dynamic(null), pred_num_lower=dynamic(null), pred_num_upper=dynamic(null)
| invoke series_fbprophet_forecast_fl('TimeStamp', 'num', 'pred_num', toint(horizon/dt), 'pred_num_lower', 'pred_num_upper')
| render timechart 

Output

Graph showing forecasting few time series.

40 - series_fit_lowess_fl()

This article describes the series_fit_lowess_fl() user-defined function.


The function series_fit_lowess_fl() is a user-defined function (UDF) that applies a LOWESS regression on a series. This function takes a table with multiple series (dynamic numerical arrays) and generates a LOWESS Curve, which is a smoothed version of the original series.
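
Outside of Kusto, the same smoothing can be obtained directly from statsmodels, which is what the embedded Python in the definition below calls. A minimal sketch, assuming statsmodels is available and using synthetic data:

import numpy as np
import statsmodels.api as sm

x = np.arange(1, 101)
y = np.sin(x / 10) + np.random.normal(scale=0.2, size=x.size)   # noisy series

# frac is the fraction of points used for each local regression (fit_size / len(y) in the UDF).
y_fit = sm.nonparametric.lowess(y, x, frac=9 / len(y), return_sorted=False)
print(y_fit[:5])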

Syntax

T | invoke series_fit_lowess_fl(y_series, y_fit_series, [ fit_size ], [ x_series ], [ x_istime ])

Parameters

NameTypeRequiredDescription
y_seriesstring✔️The name of the input table column containing the dependent variable. This column is the series to fit.
y_fit_seriesstring✔️The name of the column to store the fitted series.
fit_sizeintFor each point, the local regression is applied on its respective fit_size closest points. The default is 5.
x_seriesstringThe name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series.
x_istimeboolThis boolean parameter is needed only if x_series is specified and it’s a vector of datetime. The default is false.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_size = kargs["fit_size"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        import statsmodels.api as sm
        def lowess_fit(ts_row, x_col, y_col, fsize):
            y = ts_row[y_col]
            fraction = fsize/len(y)
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            lowess = sm.nonparametric.lowess
            z = lowess(y, x, return_sorted=False, frac=fraction)
            return list(z)
        result = df
        result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Fits a local polynomial using LOWESS method to a series")
series_fit_lowess_fl(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_size = kargs["fit_size"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        import statsmodels.api as sm
        def lowess_fit(ts_row, x_col, y_col, fsize):
            y = ts_row[y_col]
            fraction = fsize/len(y)
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            lowess = sm.nonparametric.lowess
            z = lowess(y, x, return_sorted=False, frac=fraction)
            return list(z)
        result = df
        result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
}

Examples

The following examples use the invoke operator to run the function.

LOWESS regression on regular time series

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_size = kargs["fit_size"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        import statsmodels.api as sm
        def lowess_fit(ts_row, x_col, y_col, fsize):
            y = ts_row[y_col]
            fraction = fsize/len(y)
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            lowess = sm.nonparametric.lowess
            z = lowess(y, x, return_sorted=False, frac=fraction)
            return list(z)
        result = df
        result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart

Stored

//
// Apply 9 points LOWESS regression on regular time series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9)
| render timechart

Output

Graph showing nine points LOWESS fit to a regular time series.

Test irregular time series

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_size = kargs["fit_size"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        import statsmodels.api as sm
        def lowess_fit(ts_row, x_col, y_col, fsize):
            y = ts_row[y_col]
            fraction = fsize/len(y)
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            lowess = sm.nonparametric.lowess
            z = lowess(y, x, return_sorted=False, frac=fraction)
            return list(z)
        result = df
        result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0   //  delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart 

Stored

let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-1d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0   //  delete every 6th hour to create irregular time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null)
| invoke series_fit_lowess_fl('num', 'fnum', 9, 'TimeStamp', True)
| render timechart 

Output

Graph showing nine points LOWESS fit to an irregular time series.

Compare LOWESS versus polynomial fit

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_lowess_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_size:int=5, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_size', fit_size, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_size = kargs["fit_size"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        import statsmodels.api as sm
        def lowess_fit(ts_row, x_col, y_col, fsize):
            y = ts_row[y_col]
            fraction = fsize/len(y)
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            lowess = sm.nonparametric.lowess
            z = lowess(y, x, return_sorted=False, frac=fraction)
            return list(z)
        result = df
        result[y_fit_series] = df.apply(lowess_fit, axis=1, args=(x_series, y_series, fit_size))
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart

Stored

range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_lowess = dynamic(null)
| invoke series_fit_lowess_fl('y', 'y_lowess', 15, 'x')
| extend series_fit_poly(y, x, 5)
| project x, y, y_lowess, y_polynomial=series_fit_poly_y_poly_fit
| render linechart

Output

Graphs of LOWESS vs polynomial fit for a fifth order polynomial with noise on x & y axes

41 - series_fit_poly_fl()

This article describes the series_fit_poly_fl() user-defined function.

The function series_fit_poly_fl() is a user-defined function (UDF) that applies a polynomial regression on a series. This function takes a table containing multiple series (dynamic numerical arrays) and generates the best fit high-order polynomial for each series using polynomial regression. This function returns both the polynomial coefficients and the interpolated polynomial over the range of the series.
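
Under the hood the fit is NumPy's polynomial least squares. The following standalone sketch mirrors the embedded Python in the definition below (np.polyfit to get the coefficients, np.poly1d to evaluate the fitted curve) and is shown for illustration only, with synthetic data.

import numpy as np

x = np.linspace(-2.3, 2.7, 200)
y = x**5 - 8*x**3 + 10*x + 6 + np.random.normal(scale=2.0, size=x.size)  # noisy fifth order polynomial

coeff = np.polyfit(x, y, deg=5)   # highest-order coefficient first
y_fit = np.poly1d(coeff)(x)       # fitted (interpolated) polynomial over the same x range
print(np.round(coeff, 2))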

Syntax

T | invoke series_fit_poly_fl(y_series, y_fit_series, fit_coeff, degree, [ x_series ], [ x_istime ])

Parameters

NameTypeRequiredDescription
y_seriesstring✔️The name of the input table column containing the dependent variable. That is, the series to fit.
y_fit_seriesstring✔️The name of the column to store the best fit series.
fit_coeffstring✔️The name of the column to store the best fit polynomial coefficients.
degreeint✔️The required order of the polynomial to fit. For example, 1 for linear regression, 2 for quadratic regression, and so on.
x_seriesstringThe name of the column containing the independent variable, that is, the x or time axis. This parameter is optional, and is needed only for unevenly spaced series. The default value is an empty string, as x is redundant for the regression of an evenly spaced series.
x_istimeboolThis parameter is needed only if x_series is specified and it's a vector of datetime. The default is false.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_coeff = kargs["fit_coeff"]
        degree = kargs["degree"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        
        def fit(ts_row, x_col, y_col, deg):
            y = ts_row[y_col]
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            coeff = np.polyfit(x, y, deg)
            p = np.poly1d(coeff)
            z = p(x)
            return z, coeff
        
        result = df
        if len(df):
           result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Fit a polynomial of a specified degree to a series")
series_fit_poly_fl(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=false)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_coeff = kargs["fit_coeff"]
        degree = kargs["degree"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        
        def fit(ts_row, x_col, y_col, deg):
            y = ts_row[y_col]
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            coeff = np.polyfit(x, y, deg)
            p = np.poly1d(coeff)
            z = p(x)
            return z, coeff
        
        result = df
        if len(df):
           result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
}

Examples

The following examples use the invoke operator to run the function.

Fit fifth order polynomial to a regular time series

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_coeff = kargs["fit_coeff"]
        degree = kargs["degree"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        
        def fit(ts_row, x_col, y_col, deg):
            y = ts_row[y_col]
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            coeff = np.polyfit(x, y, deg)
            p = np.poly1d(coeff)
            z = p(x)
            return z, coeff
        
        result = df
        if len(df):
           result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)

Stored

//
// Fit fifth order polynomial to a regular (evenly spaced) time series, created with make-series
//
let max_t = datetime(2016-09-03);
demo_make_series1
| make-series num=count() on TimeStamp from max_t-1d to max_t step 5m by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null), fnum1 = dynamic(null), coeff1=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 5)
| render timechart with(ycolumns=num, fnum)

Output

Graph showing fifth order polynomial fit to a regular time series.

Test irregular time series

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_coeff = kargs["fit_coeff"]
        degree = kargs["degree"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        
        def fit(ts_row, x_col, y_col, deg):
            y = ts_row[y_col]
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            coeff = np.polyfit(x, y, deg)
            p = np.poly1d(coeff)
            z = p(x)
            return z, coeff
        
        result = df
        if len(df):
           result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0   //  delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)

Stored

let max_t = datetime(2016-09-03);
demo_make_series1
| where TimeStamp between ((max_t-2d)..max_t)
| summarize num=count() by bin(TimeStamp, 5m), OsVer
| order by TimeStamp asc
| where hourofday(TimeStamp) % 6 != 0   //  delete every 6th hour to create unevenly spaced time series
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by OsVer
| extend fnum = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('num', 'fnum', 'coeff', 8, 'TimeStamp', True)
| render timechart with(ycolumns=num, fnum)

Output

Graph showing eighth order polynomial fit to an irregular time series.

Fifth order polynomial with noise on x & y axes

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_fit_poly_fl=(tbl:(*), y_series:string, y_fit_series:string, fit_coeff:string, degree:int, x_series:string='', x_istime:bool=False)
{
    let kwargs = bag_pack('y_series', y_series, 'y_fit_series', y_fit_series, 'fit_coeff', fit_coeff, 'degree', degree, 'x_series', x_series, 'x_istime', x_istime);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_fit_series = kargs["y_fit_series"]
        fit_coeff = kargs["fit_coeff"]
        degree = kargs["degree"]
        x_series = kargs["x_series"]
        x_istime = kargs["x_istime"]
        
        def fit(ts_row, x_col, y_col, deg):
            y = ts_row[y_col]
            if x_col == "": # If there is no x column creates sequential range [1, len(y)]
               x = np.arange(len(y)) + 1
            else: # if x column exists check whether its a time column. If so, normalize it to the [1, len(y)] range, else take it as is.
               if x_istime: 
                   x = pd.to_numeric(pd.to_datetime(ts_row[x_col]))
                   x = x - x.min()
                   x = x / x.max()
                   x = x * (len(x) - 1) + 1
               else:
                   x = ts_row[x_col]
            coeff = np.polyfit(x, y, deg)
            p = np.poly1d(coeff)
            z = p(x)
            return z, coeff
        
        result = df
        if len(df):
           result[[y_fit_series, fit_coeff]] = df.apply(fit, axis=1, args=(x_series, y_series, degree,), result_type="expand")
    ```;
    tbl
     | evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
|fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart

Stored

range x from 1 to 200 step 1
| project x = rand()*5 - 2.3
| extend y = pow(x, 5)-8*pow(x, 3)+10*x+6
| extend y = y + (rand() - 0.5)*0.5*y
| summarize x=make_list(x), y=make_list(y)
| extend y_fit = dynamic(null), coeff=dynamic(null)
| invoke series_fit_poly_fl('y', 'y_fit', 'coeff', 5, 'x')
|fork (project-away coeff) (project coeff | mv-expand coeff)
| render linechart

Output

Graph of fit of fifth order polynomial with noise on x & y axes

Coefficients of fit of fifth order polynomial with noise.

42 - series_lag_fl()

This article describes the series_lag_fl() user-defined function.

Applies a lag on a series.

The function series_lag_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and shifts it backward. It's commonly used for shifting time series to test whether a pattern is new or matches historical data.
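
In effect the function delays the series by offset bins: output point n takes the value of input point n-offset. A minimal NumPy sketch of that shift, padding the first offset points with zeros for illustration; the FIR-based KQL implementation below may handle the edge differently.

import numpy as np

def lag(series, offset):
    y = np.asarray(series, dtype=float)
    if offset <= 0:
        return y.copy()
    out = np.zeros_like(y)          # the first 'offset' points have no history; pad with zeros here
    out[offset:] = y[:-offset]      # out[n] = y[n - offset]
    return out

print(lag([1, 2, 3, 4, 5], 2))  # [0. 0. 1. 2. 3.]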

Syntax

series_lag_fl(y_series, offset)

Parameters

NameTypeRequiredDescription
y_seriesdynamic✔️An array cell of numeric values.
offsetint✔️An integer specifying the required offset in bins.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_lag_fl = (series:dynamic, offset:int)
{
    let lag_f = toscalar(range x from 1 to offset+1 step 1
    | project y=iff(x == offset+1, 1, 0)
    | summarize lag_filter = make_list(y));
    series_fir(series, lag_f, false)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function  with (folder = "Packages\\Series", docstring = "Shift a series by a specified offset")
series_lag_fl(series:dynamic, offset:int)
{
    let lag_f = toscalar(range x from 1 to offset+1 step 1
    | project y=iff(x == offset+1, 1, 0)
    | summarize lag_filter = make_list(y));
    series_fir(series, lag_f, false)
} 

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_lag_fl = (series:dynamic, offset:int)
{
    let lag_f = toscalar(range x from 1 to offset+1 step 1
    | project y=iff(x == offset+1, 1, 0)
    | summarize lag_filter = make_list(y));
    series_fir(series, lag_f, false)
};
let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart

Stored

let dt = 1h;
let time_shift = 1d;
let bins_shift = toint(time_shift/dt);
demo_make_series1
| make-series num=count() on TimeStamp step dt by OsVer
| extend num_shifted=series_lag_fl(num, bins_shift)
| render timechart

Output

Graph of a time series shifted by one day.

43 - series_metric_fl()

This article describes the series_metric_fl() user-defined function.

The series_metric_fl() function is a user-defined function (UDF) that selects and retrieves time series of metrics ingested to your database using the Prometheus monitoring system. This function assumes the data stored in your database is structured following the Prometheus data model. Specifically, each record contains:

  • timestamp
  • metric name
  • metric value
  • a variable set of labels ("key":"value" pairs)

Prometheus defines a time series by its metric name and a distinct set of labels. You can retrieve sets of time series using Prometheus Query Language (PromQL) by specifying the metric name and time series selector (a set of labels).
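For reference, the following hedged sketch shows the assumed record shape with hypothetical values (the column names match the demo_prometheus examples later in this article):

datatable(TimeStamp:datetime, Name:string, Labels:dynamic, Val:real)
[
    datetime(2020-12-08 00:00), 'writes', dynamic({"disk":"sda1", "host":"aks-agentpool-88086459-vmss000001"}), 1234.0
]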

Syntax

T | invoke series_metric_fl(timestamp_col, name_col, labels_col, value_col, metric_name, labels_selector, lookback, offset)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| timestamp_col | string | ✔️ | The name of the column containing the timestamp. |
| name_col | string | ✔️ | The name of the column containing the metric name. |
| labels_col | string | ✔️ | The name of the column containing the labels dictionary. |
| value_col | string | ✔️ | The name of the column containing the metric value. |
| metric_name | string | ✔️ | The metric time series to retrieve. |
| labels_selector | string | | Time series selector string, similar to PromQL. It's a string containing a list of "key":"value" pairs, for example '"key1":"val1","key2":"val2"'. The default is an empty string, which means no filtering. Note that regular expressions are not supported. |
| lookback | timespan | | The range vector to retrieve, similar to PromQL. The default is 10 minutes. |
| offset | timespan | | Offset back from the current time to retrieve, similar to PromQL. Data is retrieved from ago(offset)-lookback to ago(offset). The default is 0, which means that data is retrieved up to now(). |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
    let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
    let etime = ago(offset);
    let stime = etime - lookback;
    metrics_tbl
    | extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
    | extend labels = dynamic_to_json(labels)       //  convert to string and sort by key
    | where name == metric_name and timestamp between(stime..etime)
    | order by timestamp asc
    | summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
    | where labels has_all (selector_d)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create function with (folder = "Packages\\Series", docstring = "Selecting & retrieving metrics like PromQL")
series_metric_fl(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
    let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
    let etime = ago(offset);
    let stime = etime - lookback;
    metrics_tbl
    | extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
    | extend labels = dynamic_to_json(labels)       //  convert to string and sort by key
    | where name == metric_name and timestamp between(stime..etime)
    | order by timestamp asc
    | summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
    | where labels has_all (selector_d)
}

Examples

The following examples use the invoke operator to run the function.

With specifying selector

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
    let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
    let etime = ago(offset);
    let stime = etime - lookback;
    metrics_tbl
    | extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
    | extend labels = dynamic_to_json(labels)       //  convert to string and sort by key
    | where name == metric_name and timestamp between(stime..etime)
    | order by timestamp asc
    | summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
    | where labels has_all (selector_d)
};
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)

Stored

demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1","host":"aks-agentpool-88086459-vmss000001"', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels)

Output

Graph showing disk write metric over 10 minutes.

Without specifying selector

The following example doesn't specify a selector, so all 'writes' metrics are selected. This example assumes that the function is already installed, and uses the alternative direct calling syntax, specifying the input table as the first parameter:

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_metric_fl=(metrics_tbl:(*), timestamp_col:string, name_col:string, labels_col:string, value_col:string, metric_name:string, labels_selector:string='', lookback:timespan=timespan(10m), offset:timespan=timespan(0))
{
    let selector_d=iff(labels_selector == '', dynamic(['']), split(labels_selector, ','));
    let etime = ago(offset);
    let stime = etime - lookback;
    metrics_tbl
    | extend timestamp = column_ifexists(timestamp_col, datetime(null)), name = column_ifexists(name_col, ''), labels = column_ifexists(labels_col, dynamic(null)), value = column_ifexists(value_col, 0)
    | extend labels = dynamic_to_json(labels)       //  convert to string and sort by key
    | where name == metric_name and timestamp between(stime..etime)
    | order by timestamp asc
    | summarize timestamp = make_list(timestamp), value=make_list(value) by name, labels
    | where labels has_all (selector_d)
};
series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)

Stored

series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| render timechart with(series=labels, ysplit=axes)

Output

Graph showing disk write metric for all disks over 10 minutes.

44 - series_monthly_decompose_anomalies_fl()

Learn how to use the series_monthly_decompose_anomalies_fl() function to detect anomalies in a series with monthly seasonality.

Detect anomalous points in a daily series with monthly seasonality.

The function series_monthly_decompose_anomalies_fl() is a user-defined function (UDF) that detects anomalies in multiple time series that have monthly seasonality. The function is built on top of series_decompose_anomalies(). The challenge is that the length of a month varies between 28 and 31 days, so a baseline built with series_decompose_anomalies() out of the box detects only a fixed seasonality and thus fails to match spikes or other patterns that occur on the first or any other specific day of each month.
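To work around the variable month length, the function maps each date onto a virtual calendar in which every month has 31 days. The following minimal sketch shows the mapping that the function definition below applies (illustrative date):

print _date = datetime(2023-02-28)
| extend _dom = dayofmonth(_date), _moy = monthofyear(_date)
| extend _vdoy = 31 * (_moy - 1) + _dom    // virtual day of year: 31*1 + 28 = 59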

Syntax

T | invoke series_monthly_decompose_anomalies_fl([ threshold ])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| threshold | real | | Anomaly threshold. The default is 1.5. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
    let _tbl=materialize(tbl
    | extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
    | extend _vdoy = 31*(_moy-1)+_dom                  //  virtual day of year (assuming all months have 31 days)
    );
    let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
    let keys = _tbl | summarize by _key | extend dummy=1;
    let years = _tbl | summarize by _year | extend dummy=1;
    let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
    | join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
    vdoys
    | join kind=leftouter _tbl on _key, _year, _vdoy
    | project-away _key1, _year1, _moy1, _vdoy1
    | extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
    | partition by _key (as T
        | where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and 
          _vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
    )
    | join kind=inner median_tbl on _key, $left._vdom == $right._dom
    | extend _vval = coalesce(_val, p50)
    //| order by _key asc, _vadoy asc     //  for debugging
    | make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
    | extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
    | mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
    | project-away _vadoy
    | project-rename _val=_vval
    | where isnotnull(_date)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for daily time series with monthly seasonality")
series_monthly_decompose_anomalies_fl(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
    let _tbl=materialize(tbl
    | extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
    | extend _vdoy = 31*(_moy-1)+_dom                  //  virtual day of year (assuming all months have 31 days)
    );
    let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
    let keys = _tbl | summarize by _key | extend dummy=1;
    let years = _tbl | summarize by _year | extend dummy=1;
    let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
    | join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
    vdoys
    | join kind=leftouter _tbl on _key, _year, _vdoy
    | project-away _key1, _year1, _moy1, _vdoy1
    | extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
    | partition by _key (as T
        | where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and 
          _vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
    )
    | join kind=inner median_tbl on _key, $left._vdom == $right._dom
    | extend _vval = coalesce(_val, p50)
    //| order by _key asc, _vadoy asc     //  for debugging
    | make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
    | extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
    | mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
    | project-away _vadoy
    | project-rename _val=_vval
    | where isnotnull(_date)
}

Example

The input table must contain _key, _date, and _val columns. The query builds a set of time series of _val for each _key, and adds the anomalies, score, and baseline columns.
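A minimal sketch of the expected input shape, with hypothetical values:

datatable(_key:string, _date:datetime, _val:real)
[
    'A', datetime(2023-01-01), 120.0,
    'A', datetime(2023-01-02), 118.5,
    'B', datetime(2023-01-01), 64.0
]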

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_monthly_decompose_anomalies_fl=(tbl:(_key:string, _date:datetime, _val:real), threshold:real=1.5)
{
    let _tbl=materialize(tbl
    | extend _year=getyear(_date), _dom = dayofmonth(_date), _moy=monthofyear(_date), _doy=dayofyear(_date)
    | extend _vdoy = 31*(_moy-1)+_dom                  //  virtual day of year (assuming all months have 31 days)
    );
    let median_tbl = _tbl | summarize p50=percentiles(_val, 50) by _key, _dom;
    let keys = _tbl | summarize by _key | extend dummy=1;
    let years = _tbl | summarize by _year | extend dummy=1;
    let vdoys = range _vdoy from 0 to 31*12-1 step 1 | extend _moy=_vdoy/31+1, _vdom=_vdoy%31+1, _vdoy=_vdoy+1 | extend dummy=1
    | join kind=fullouter years on dummy | join kind=fullouter keys on dummy | project-away dummy, dummy1, dummy2;
    vdoys
    | join kind=leftouter _tbl on _key, _year, _vdoy
    | project-away _key1, _year1, _moy1, _vdoy1
    | extend _adoy=31*12*_year+_doy, _vadoy = 31*12*_year+_vdoy
    | partition by _key (as T
        | where _vadoy >= toscalar(T | summarize (_adoy, _vadoy)=arg_min(_adoy, _vadoy) | project _vadoy) and 
          _vadoy <= toscalar(T | summarize (_adoy, _vadoy)=arg_max(_adoy, _vadoy) | project _vadoy)
    )
    | join kind=inner median_tbl on _key, $left._vdom == $right._dom
    | extend _vval = coalesce(_val, p50)
    //| order by _key asc, _vadoy asc     //  for debugging
    | make-series _vval=avg(_vval), _date=any(_date) default=datetime(null) on _vadoy step 1 by _key
    | extend (anomalies, score, baseline) = series_decompose_anomalies(_vval, threshold, 31)
    | mv-expand _date to typeof(datetime), _vval to typeof(real), _vadoy to typeof(long), anomalies to typeof(int), score to typeof(real), baseline to typeof(real)
    | project-away _vadoy
    | project-rename _val=_vval
    | where isnotnull(_date)
};
demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)

Stored

demo_monthly_ts
| project _key=key, _date=ts, _val=val
| invoke series_monthly_decompose_anomalies_fl()
| project-rename key=_key, ts=_date, val=_val
| render anomalychart with(anomalycolumns=anomalies, xcolumn=ts, ycolumns=val)

Output

Series A with monthly anomalies:

Graph of time series ‘A’ with monthly anomalies.

Series B with monthly anomalies:

Graph of time series ‘B’ with monthly anomalies.

45 - series_moving_avg_fl()

This article describes the series_moving_avg_fl() user-defined function.

Applies a moving average filter on a series.

The function series_moving_avg_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a simple moving average filter to it.
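Under the hood, the filter is a normalized FIR window of ones of width n, as the function definition below shows. A minimal sketch with illustrative values:

print y = dynamic([2.0, 4.0, 6.0, 8.0, 10.0])
| extend y_ma3 = series_fir(y, repeat(1, 3), true, false)    // a normalized window of three ones is a simple moving average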

Syntax

series_moving_avg_fl(y_series, n [, center ])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| n | int | ✔️ | The width of the moving average filter. |
| center | bool | | Indicates whether the moving average is either applied symmetrically on a window before and after the current point or applied on a window from the current point backwards. By default, center is false. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
    series_fir(y_series, repeat(1, n), true, center)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving average of specified width")
series_moving_avg_fl(y_series:dynamic, n:int, center:bool=false)
{
    series_fir(y_series, repeat(1, n), true, center)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_moving_avg_fl = (y_series:dynamic, n:int, center:bool=false)
{
    series_fir(y_series, repeat(1, n), true, center)
};
//
//  Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, true)
| render timechart 

Stored

//
//  Moving average of 5 bins
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend num_ma=series_moving_avg_fl(num, 5, true)
| render timechart 

Output

Graph depicting moving average of 5 bins.

46 - series_moving_var_fl()

This article describes the series_moving_var_fl() user-defined function.

Applies a moving variance filter on a series.

The function series_moving_var_fl() is a user-defined function (UDF) that takes an expression containing a dynamic numerical array as input and applies a moving variance filter to it.
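Under the hood, the function computes the moving variance from two moving averages, using the standard identity Var(Y) = E[Y^2] - (E[Y])^2; both expectations are produced with the same normalized FIR filter of ones that series_moving_avg_fl() uses (see the function definition below).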

Syntax

series_moving_var_fl(y_series, n [, center ])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| n | int | ✔️ | The width of the moving variance filter. |
| center | bool | | Indicates whether the moving variance is either applied symmetrically on a window before and after the current point or applied on a window from the current point backwards. By default, center is false. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
    let ey = series_fir(y_series, repeat(1, n), true, center);
    let e2y = series_multiply(ey, ey);
    let y2 = series_multiply(y_series, y_series);
    let ey2 = series_fir(y2, repeat(1, n), true, center);
    let var_series = series_subtract(ey2, e2y);
    var_series
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Calculate moving variance of specified width")
series_moving_var_fl(y_series:dynamic, n:int, center:bool=false)
{
    let ey = series_fir(y_series, repeat(1, n), true, center);
    let e2y = series_multiply(ey, ey);
    let y2 = series_multiply(y_series, y_series);
    let ey2 = series_fir(y2, repeat(1, n), true, center);
    let var_series = series_subtract(ey2, e2y);
    var_series
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_moving_var_fl = (y_series:dynamic, n:int, center:bool=false)
{
    let ey = series_fir(y_series, repeat(1, n), true, center);
    let e2y = series_multiply(ey, ey);
    let y2 = series_multiply(y_series, y_series);
    let ey2 = series_fir(y2, repeat(1, n), true, center);
    let var_series = series_subtract(ey2, e2y);
    var_series
}
;
let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
    gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc 
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart  

Stored

let sinewave=(x:double, period:double, gain:double=1.0, phase:double=0.0)
{
    gain*sin(2*pi()/period*(x+phase))
}
;
let n=128;
let T=10;
let window=T*2;
union
(range x from 0 to n-1 step 1 | extend y=sinewave(x, T)),
(range x from n to 2*n-1 step 1 | extend y=0.0),
(range x from 2*n to 3*n-1 step 1 | extend y=sinewave(x, T)),
(range x from 3*n to 4*n-1 step 1 | extend y=(x-3.0*n)/128.0),
(range x from 4*n to 5*n-1 step 1 | extend y=sinewave(x, T))
| order by x asc 
| summarize x=make_list(x), y=make_list(y)
| extend y_var=series_moving_var_fl(y, T, true)
| render linechart

Output

Graph depicting moving variance applied over a sine wave.

47 - series_mv_ee_anomalies_fl()

Learn how to use the series_mv_ee_anomalies_fl() user-defined function.

The function series_mv_ee_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the elliptic envelope model from scikit-learn. This model assumes that the source of the multivariate data is a multi-dimensional normal distribution. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. It builds a multi-dimensional elliptical envelope for each series and marks the points that fall outside this normal envelope as anomalies.

Syntax

T | invoke series_mv_ee_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct ]])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.covariance import EllipticEnvelope
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            ellipsoid.fit(dffe)
            df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional normally distributed data using elliptical envelope model")
series_mv_ee_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.covariance import EllipticEnvelope
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            ellipsoid.fit(dffe)
            df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

// Define function
let series_mv_ee_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.covariance import EllipticEnvelope
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        ellipsoid = EllipticEnvelope(contamination=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            ellipsoid.fit(dffe)
            df.loc[i, anomaly_col] = (ellipsoid.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = ellipsoid.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart

Stored

normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores')
| extend anomalies=series_multiply(80, anomalies)
| render timechart

Output

The table normal_2d_with_anomalies contains a set of three time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

Graph showing multivariate anomalies on a time chart.

To view the data as a scatter chart, replace the usage code with the following:

normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

Graph showing multivariate anomalies on a scatter chart.

You can see that on TS1 most of the midnight anomalies were detected using this multivariate model.
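Because the anomaly flags are returned as a dynamic array per series, you can also aggregate them directly. The following hedged sketch, assuming the function is defined or installed as shown above, counts the detected anomalies per series:

normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_ee_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| project name, num_anomalies = toint(array_sum(anomalies))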

Create a sample dataset

.set normal_2d_with_anomalies <|
//
let window=14d;
let dt=1h;
let n=toint(window/dt);
let rand_normal_fl=(avg:real=0.0, stdv:real=1.0)
{
    let x =rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand()+rand();
    (x - 6)*stdv + avg
};
union
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(10, 5)
| extend y=iff(hourofday(t) == 0, 2*(10-x)+7+rand_normal_fl(0, 3), 2*x+7+rand_normal_fl(0, 3))  //  anomalies every midnight
| extend name='TS1'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(15, 3)
| extend y=iff(hourofday(t) == 8, (15-x)+10+rand_normal_fl(0, 2), x-7+rand_normal_fl(0, 1)) //  anomalies every 8am
| extend name='TS2'),
(range s from 0 to n step 1
| project t=startofday(now())-s*dt
| extend x=rand_normal_fl(8, 6)
| extend y=iff(hourofday(t) == 16, x+5+rand_normal_fl(0, 4), (12-x)+rand_normal_fl(0, 4)) //  anomalies every 4pm
| extend name='TS3')
| summarize t=make_list(t), x=make_list(x), y=make_list(y) by name

Scatter chart of the sample dataset.

48 - series_mv_if_anomalies_fl()

This article describes the series_mv_if_anomalies_fl() user-defined function.

The function series_mv_if_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the isolation forest model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. It builds an ensemble of isolation trees for each series and marks the points that are quickly isolated as anomalies.

Syntax

T | invoke series_mv_if_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct [, num_trees [, samples_pct ]]]])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |
| num_trees | int | | The number of isolation trees to build for each time series. Default value: 100. |
| samples_pct | real | | A real number in the range [0-100] specifying the percentage of samples used to build each tree. Default value: 100%, i.e. use the full series. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
    let code = ```if 1:
        from sklearn.ensemble import IsolationForest
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        num_trees = kargs['num_trees']
        samples_pct = kargs['samples_pct']
        dff = df[features_cols]
        iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            iforest.fit(dffe)
            df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using isolation forest model")
series_mv_if_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
    let code = ```if 1:
        from sklearn.ensemble import IsolationForest
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        num_trees = kargs['num_trees']
        samples_pct = kargs['samples_pct']
        dff = df[features_cols]
        iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            iforest.fit(dffe)
            df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

// Define function
let series_mv_if_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0, num_trees:int=100, samples_pct:real=100.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct, 'num_trees', num_trees, 'samples_pct', samples_pct);
    let code = ```if 1:
        from sklearn.ensemble import IsolationForest
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        num_trees = kargs['num_trees']
        samples_pct = kargs['samples_pct']
        dff = df[features_cols]
        iforest = IsolationForest(contamination=anomalies_pct/100.0, random_state=0, n_estimators=num_trees, max_samples=samples_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            iforest.fit(dffe)
            df.loc[i, anomaly_col] = (iforest.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = iforest.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart

Stored

normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=8, num_trees=1000)
| extend anomalies=series_multiply(40, anomalies)
| render timechart

Output

The table normal_2d_with_anomalies contains a set of three time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

Graph showing multivariate anomalies on a time chart.

To view the data as a scatter chart, replace the usage code with the following:

normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_if_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

Graph showing multivariate anomalies on a scatter chart.

You can see that on TS2 most of the anomalies occurring at 8am were detected using this multivariate model.

49 - series_mv_oc_anomalies_fl()

This article describes the series_mv_oc_anomalies_fl() user-defined function.

The function series_mv_oc_anomalies_fl() is a user-defined function (UDF) that detects multivariate anomalies in series by applying the One-Class SVM model from scikit-learn. The function accepts a set of series as numerical dynamic arrays, the names of the feature columns, and the expected percentage of anomalies out of the whole series. It trains a one-class SVM for each series and marks the points that fall outside the hypersphere as anomalies.

Syntax

T | invoke series_mv_oc_anomalies_fl(features_cols, anomaly_col [, score_col [, anomalies_pct ]])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| features_cols | dynamic | ✔️ | An array containing the names of the columns that are used for the multivariate anomaly detection model. |
| anomaly_col | string | ✔️ | The name of the column to store the detected anomalies. |
| score_col | string | | The name of the column to store the scores of the anomalies. |
| anomalies_pct | real | | A real number in the range [0-50] specifying the expected percentage of anomalies in the data. Default value: 4%. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.svm import OneClassSVM
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        svm = OneClassSVM(nu=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            svm.fit(dffe)
            df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = svm.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Write your query to use the function.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Anomaly Detection for multi dimensional data using One Class SVM model")
series_mv_oc_anomalies_fl(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.svm import OneClassSVM
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        svm = OneClassSVM(nu=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            svm.fit(dffe)
            df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = svm.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_mv_oc_anomalies_fl=(tbl:(*), features_cols:dynamic, anomaly_col:string, score_col:string='', anomalies_pct:real=4.0)
{
    let kwargs = bag_pack('features_cols', features_cols, 'anomaly_col', anomaly_col, 'score_col', score_col, 'anomalies_pct', anomalies_pct);
    let code = ```if 1:
        from sklearn.svm import OneClassSVM
        features_cols = kargs['features_cols']
        anomaly_col = kargs['anomaly_col']
        score_col = kargs['score_col']
        anomalies_pct = kargs['anomalies_pct']
        dff = df[features_cols]
        svm = OneClassSVM(nu=anomalies_pct/100.0)
        for i in range(len(dff)):
            dffi = dff.iloc[[i], :]
            dffe = dffi.explode(features_cols)
            svm.fit(dffe)
            df.loc[i, anomaly_col] = (svm.predict(dffe) < 0).astype(int).tolist()
            if score_col != '':
                df.loc[i, score_col] = svm.decision_function(dffe).tolist()
        result = df
    ```;
    tbl
    | evaluate hint.distribution=per_node python(typeof(*), code, kwargs)
};
// Usage
normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart

Stored

normal_2d_with_anomalies
| extend anomalies=dynamic(null), scores=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies', 'scores', anomalies_pct=6)
| extend anomalies=series_multiply(80, anomalies)
| render timechart

Output

The table normal_2d_with_anomalies contains a set of three time series. Each time series has a two-dimensional normal distribution with daily anomalies added at midnight, 8am, and 4pm respectively. You can create this sample dataset using an example query.

Graph showing multivariate anomalies on a time chart.

To view the data as a scatter chart, replace the usage code with the following:

normal_2d_with_anomalies
| extend anomalies=dynamic(null)
| invoke series_mv_oc_anomalies_fl(pack_array('x', 'y'), 'anomalies')
| where name == 'TS1'
| project x, y, anomalies
| mv-expand x to typeof(real), y to typeof(real), anomalies to typeof(string)
| render scatterchart with(series=anomalies)

Graph showing multivariate anomalies on a scatter chart.

You can see that on TS1 most of the anomalies occurring at midnight were detected using this multivariate model.

50 - series_rate_fl()

This article describes the series_rate_fl() user-defined function.

The function series_rate_fl() is a user-defined function (UDF) that calculates the average rate of metric increase per second. Its logic follows the PromQL rate() function. It should be used for time series of counter metrics ingested into your database by the Prometheus monitoring system and retrieved by series_metric_fl().

Syntax

T | invoke series_rate_fl([ n_bins [, fix_reset ]])

T is a table returned from series_metric_fl(). Its schema includes (timestamp:dynamic, name:string, labels:string, value:dynamic).

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| n_bins | int | | The number of bins specifying the gap between the extracted metric values used for the rate calculation. The function calculates the difference between the current sample and the one n_bins before it, and divides it by the difference of their respective timestamps in seconds. The default is one bin. The default settings calculate irate(), the PromQL instantaneous rate function. |
| fix_reset | bool | | Controls whether to check for counter resets and correct them, like the PromQL rate() function. The default is true. Set it to false to skip this analysis when there's no need to check for resets. |
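To illustrate the core calculation, the following hedged sketch applies the same shift-and-divide steps to a tiny hand-made series with hypothetical values (the values only increase, so the counter-reset correction isn't triggered):

print timestamp = pack_array(datetime(2020-12-08 00:00:00), datetime(2020-12-08 00:01:00), datetime(2020-12-08 00:02:00)),
      value = pack_array(100.0, 160.0, 250.0)
| extend timestampS = array_shift_right(timestamp, 1), valueS = array_shift_right(value, 1)
| extend dt = series_divide(series_subtract(timestamp, timestampS), 1e7)    // ticks to seconds
| extend rate = series_divide(series_subtract(value, valueS), dt)           // expected: approximately [null, 1.0, 1.5] per second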

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
    tbl
    | where fix_reset                                                   //  Prometheus counters can only go up
    | mv-apply value to typeof(double) on   
    ( extend correction = iff(value < prev(value), prev(value), 0.0)    // if the value decreases we assume it was reset to 0, so add last value
    | extend cum_correction = row_cumsum(correction)
    | extend corrected_value = value + cum_correction
    | summarize value = make_list(corrected_value))
    | union (tbl | where not(fix_reset))
    | extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
    | extend dt = series_subtract(timestamp, timestampS)
    | extend dt = series_divide(dt, 1e7)                              //  converts from ticks to seconds
    | extend dv = series_subtract(value, valueS)
    | extend rate = series_divide(dv, dt)
    | project-away dt, dv, timestampS, value, valueS
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create function with (folder = "Packages\\Series", docstring = "Simulate PromQL rate()")
series_rate_fl(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
    tbl
    | where fix_reset                                                   //  Prometheus counters can only go up
    | mv-apply value to typeof(double) on   
    ( extend correction = iff(value < prev(value), prev(value), 0.0)    // if the value decreases we assume it was reset to 0, so add last value
    | extend cum_correction = row_cumsum(correction)
    | extend corrected_value = value + cum_correction
    | summarize value = make_list(corrected_value))
    | union (tbl | where not(fix_reset))
    | extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
    | extend dt = series_subtract(timestamp, timestampS)
    | extend dt = series_divide(dt, 1e7)                              //  converts from ticks to seconds
    | extend dv = series_subtract(value, valueS)
    | extend rate = series_divide(dv, dt)
    | project-away dt, dv, timestampS, value, valueS
}

Examples

The following examples use the invoke operator to run the function.

Calculate average rate of metric increase

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
    tbl
    | where fix_reset                                                   //  Prometheus counters can only go up
    | mv-apply value to typeof(double) on   
    ( extend correction = iff(value < prev(value), prev(value), 0.0)    // if the value decreases we assume it was reset to 0, so add last value
    | extend cum_correction = row_cumsum(correction)
    | extend corrected_value = value + cum_correction
    | summarize value = make_list(corrected_value))
    | union (tbl | where not(fix_reset))
    | extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
    | extend dt = series_subtract(timestamp, timestampS)
    | extend dt = series_divide(dt, 1e7)                              //  converts from ticks to seconds
    | extend dv = series_subtract(value, valueS)
    | extend rate = series_divide(dv, dt)
    | project-away dt, dv, timestampS, value, valueS
};
//
demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)

Stored

demo_prometheus
| invoke series_metric_fl('TimeStamp', 'Name', 'Labels', 'Val', 'writes', offset=now()-datetime(2020-12-08 00:00))
| invoke series_rate_fl(2)
| render timechart with(series=labels)

Output

Graph showing rate per second of disk write metric for all disks.

Select the main disk of two hosts

The following example selects the main disk of two hosts, and assumes that the function is already installed. This example uses the alternative direct calling syntax, specifying the input table as the first parameter:

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_rate_fl=(tbl:(timestamp:dynamic, value:dynamic), n_bins:int=1, fix_reset:bool=true)
{
    tbl
    | where fix_reset                                                   //  Prometheus counters can only go up
    | mv-apply value to typeof(double) on   
    ( extend correction = iff(value < prev(value), prev(value), 0.0)    // if the value decreases we assume it was reset to 0, so add last value
    | extend cum_correction = row_cumsum(correction)
    | extend corrected_value = value + cum_correction
    | summarize value = make_list(corrected_value))
    | union (tbl | where not(fix_reset))
    | extend timestampS = array_shift_right(timestamp, n_bins), valueS = array_shift_right(value, n_bins)
    | extend dt = series_subtract(timestamp, timestampS)
    | extend dt = series_divide(dt, 1e7)                              //  converts from ticks to seconds
    | extend dv = series_subtract(value, valueS)
    | extend rate = series_divide(dv, dt)
    | project-away dt, dv, timestampS, value, valueS
};
//
series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)

Stored

series_rate_fl(series_metric_fl(demo_prometheus, 'TimeStamp', 'Name', 'Labels', 'Val', 'writes', '"disk":"sda1"', lookback=2h, offset=now()-datetime(2020-12-08 00:00)), n_bins=10)
| render timechart with(series=labels)

Output

Graph showing rate per second of main disk write metric in the last two hours with 10 bins gap.

51 - series_rolling_fl()

This article describes the series_rolling_fl() user-defined function.

The function series_rolling_fl() is a user-defined function (UDF) that applies a rolling aggregation on a series. It takes a table containing multiple series (dynamic numerical arrays) and applies, for each series, a rolling aggregation function.

Syntax

T | invoke series_rolling_fl(y_series, y_rolling_series, n, aggr, aggr_params, center)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | string | ✔️ | The name of the column that contains the series on which to apply the rolling aggregation. |
| y_rolling_series | string | ✔️ | The name of the column to store the rolling aggregation series. |
| n | int | ✔️ | The width of the rolling window. |
| aggr | string | ✔️ | The name of the aggregation function to use. See aggregation functions. |
| aggr_params | dynamic | | Optional parameters for the aggregation function. |
| center | bool | | Indicates whether the rolling window is applied symmetrically before and after the current point or applied from the current point backwards. By default, center is false, for calculation on streaming data. |

Aggregation functions

This function supports any aggregation function from numpy or scipy.stats that calculates a scalar from a series. For example, the examples below use median, min, max, and percentile from numpy and a trimmed mean from scipy.stats; the set of supported functions isn't limited to these.

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
    let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_rolling_series = kargs["y_rolling_series"]
        n = kargs["n"]
        aggr = kargs["aggr"]
        aggr_params = kargs["aggr_params"]
        center = kargs["center"]
        result = df
        in_s = df[y_series]
        func = getattr(np, aggr, None)
        if not func:
            import scipy.stats
            func = getattr(scipy.stats, aggr)
        if func:
            result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Rolling window functions on a series")
series_rolling_fl(tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic, center:bool=true)
{
    let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_rolling_series = kargs["y_rolling_series"]
        n = kargs["n"]
        aggr = kargs["aggr"]
        aggr_params = kargs["aggr_params"]
        center = kargs["center"]
        result = df
        in_s = df[y_series]
        func = getattr(np, aggr, None)
        if not func:
            import scipy.stats
            func = getattr(scipy.stats, aggr)
        if func:
            result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Examples

The following examples use the invoke operator to run the function.

Calculate rolling median of 9 elements

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
    let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_rolling_series = kargs["y_rolling_series"]
        n = kargs["n"]
        aggr = kargs["aggr"]
        aggr_params = kargs["aggr_params"]
        center = kargs["center"]
        result = df
        in_s = df[y_series]
        func = getattr(np, aggr, None)
        if not func:
            import scipy.stats
            func = getattr(scipy.stats, aggr)
        if func:
            result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
//
//  Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median')
| render timechart

Stored

//
//  Calculate rolling median of 9 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_med = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_med', 9, 'median', dynamic([null]))
| render timechart

Output

Graph depicting rolling median of 9 elements.

Calculate rolling min, max & 75th percentile of 15 elements

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
    let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_rolling_series = kargs["y_rolling_series"]
        n = kargs["n"]
        aggr = kargs["aggr"]
        aggr_params = kargs["aggr_params"]
        center = kargs["center"]
        result = df
        in_s = df[y_series]
        func = getattr(np, aggr, None)
        if not func:
            import scipy.stats
            func = getattr(scipy.stats, aggr)
        if func:
            result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
//
//  Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart

Stored

//
//  Calculate rolling min, max & 75th percentile of 15 elements
//
demo_make_series1
| make-series num=count() on TimeStamp step 1h by OsVer
| extend rolling_min = dynamic(null), rolling_max = dynamic(null), rolling_pct = dynamic(null)
| invoke series_rolling_fl('num', 'rolling_min', 15, 'min', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_max', 15, 'max', dynamic([null]))
| invoke series_rolling_fl('num', 'rolling_pct', 15, 'percentile', dynamic([75]))
| render timechart

Output

Graph depicting rolling min, max & 75th percentile of 15 elements.

Calculate the rolling trimmed mean

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_rolling_fl = (tbl:(*), y_series:string, y_rolling_series:string, n:int, aggr:string, aggr_params:dynamic=dynamic([null]), center:bool=true)
{
    let kwargs = bag_pack('y_series', y_series, 'y_rolling_series', y_rolling_series, 'n', n, 'aggr', aggr, 'aggr_params', aggr_params, 'center', center);
    let code = ```if 1:
        y_series = kargs["y_series"]
        y_rolling_series = kargs["y_rolling_series"]
        n = kargs["n"]
        aggr = kargs["aggr"]
        aggr_params = kargs["aggr_params"]
        center = kargs["center"]
        result = df
        in_s = df[y_series]
        func = getattr(np, aggr, None)
        if not func:
            import scipy.stats
            func = getattr(scipy.stats, aggr)
        if func:
            result[y_rolling_series] = list(pd.Series(in_s[i]).rolling(n, center=center, min_periods=1).apply(func, args=tuple(aggr_params)).values for i in range(len(in_s)))
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) //  trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart

Stored

range x from 1 to 100 step 1
| extend y=iff(x % 13 == 0, 2.0, iff(x % 23 == 0, -2.0, rand()))
| summarize x=make_list(x), y=make_list(y)
| extend yr = dynamic(null)
| invoke series_rolling_fl('y', 'yr', 7, 'tmean', pack_array(pack_array(-2, 2), pack_array(false, false))) //  trimmed mean: ignoring values outside [-2,2] inclusive
| render linechart

Output

Graph depicting rolling trimmed mean.
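
The aggr_params array is forwarded to the aggregation function as additional positional arguments. For 'tmean' they become the limits and inclusive arguments of scipy.stats.tmean, which trim values outside the given limits. The following standalone Python sketch, using made-up values, mimics what the embedded code does for a single series; it isn't part of the stored function:

import pandas as pd
from scipy import stats

# One numeric series, standing in for a single cell of the 'y' column.
y = pd.Series([0.3, 2.0, 0.7, -2.0, 0.1, 0.9, 0.5])

# aggr='tmean' resolves to scipy.stats.tmean; aggr_params supply its positional
# arguments limits=(-2, 2) and inclusive=(False, False), so values outside the
# limits, and values equal to -2 or 2, are excluded from each window's mean.
aggr_params = ((-2, 2), (False, False))
rolling_tmean = y.rolling(7, center=True, min_periods=1).apply(stats.tmean, args=aggr_params)
print(rolling_tmean.values)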

52 - series_shapes_fl()

This article describes the series_shapes_fl() user-defined function.

The function series_shapes_fl() is a user-defined function (UDF) that detects a positive or negative trend or jump in a series. The function takes a table containing multiple time series (dynamic numerical arrays) and calculates trend and jump scores for each series. The output is a dictionary (dynamic) containing the scores.

Syntax

T | extend series_shapes_fl(y_series, advanced)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | dynamic | ✔️ | An array cell of numeric values. |
| advanced | bool | | The default is false. Set to true to output additional calculated parameters. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
    let n = array_length(series);
//  calculate normal dynamic range between 10th and 90th percentiles
    let xs = array_sort_asc(series);
    let low_idx = tolong(n*0.1);
    let high_idx = tolong(n*0.9);
    let low_pct = todouble(xs[low_idx]);
    let high_pct = todouble(xs[high_idx]);
    let norm_range = high_pct-low_pct;
//  trend score
    let lf = series_fit_line_dynamic(series);
    let slope = todouble(lf.slope);
    let rsquare = todouble(lf.rsquare);
    let rel_slope = abs(n*slope/norm_range);
    let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
    let norm_slope = sign_slope*rel_slope/(rel_slope+0.1);  //  map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let trend_score = norm_slope*rsquare;
//  jump score
    let lf2=series_fit_2lines_dynamic(series);
    let lslope = todouble(lf2.left.slope);
    let rslope = todouble(lf2.right.slope);
    let rsquare2 = todouble(lf2.rsquare);
    let split_idx = tolong(lf2.split_idx);
    let last_left = todouble(lf2.left.interception)+lslope*split_idx;
    let first_right = todouble(lf2.right.interception)+rslope;
    let jump = first_right-last_left;
    let rel_jump = abs(jump/norm_range);
    let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
    let norm_jump = sign_jump*rel_jump/(rel_jump+0.1);  //  map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let jump_score1 = norm_jump*rsquare2;
//  filter for jumps that are not close to the series edges and the right slope has the same direction
    let norm_rslope = abs(rslope/norm_range);
    let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
    let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
                              "trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
                              "lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
                              , bag_pack("trend_score", trend_score, "jump_score", jump_score));
    res
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Series detector for positive/negative trend or step. Returns a dynamic with trend and jump scores")
series_shapes_fl(series:dynamic, advanced:bool=false)
{
    let n = array_length(series);
//  calculate normal dynamic range between 10th and 90th percentiles
    let xs = array_sort_asc(series);
    let low_idx = tolong(n*0.1);
    let high_idx = tolong(n*0.9);
    let low_pct = todouble(xs[low_idx]);
    let high_pct = todouble(xs[high_idx]);
    let norm_range = high_pct-low_pct;
//  trend score
    let lf = series_fit_line_dynamic(series);
    let slope = todouble(lf.slope);
    let rsquare = todouble(lf.rsquare);
    let rel_slope = abs(n*slope/norm_range);
    let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
    let norm_slope = sign_slope*rel_slope/(rel_slope+0.1);  //  map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let trend_score = norm_slope*rsquare;
//  jump score
    let lf2=series_fit_2lines_dynamic(series);
    let lslope = todouble(lf2.left.slope);
    let rslope = todouble(lf2.right.slope);
    let rsquare2 = todouble(lf2.rsquare);
    let split_idx = tolong(lf2.split_idx);
    let last_left = todouble(lf2.left.interception)+lslope*split_idx;
    let first_right = todouble(lf2.right.interception)+rslope;
    let jump = first_right-last_left;
    let rel_jump = abs(jump/norm_range);
    let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
    let norm_jump = sign_jump*rel_jump/(rel_jump+0.1);  //  map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let jump_score1 = norm_jump*rsquare2;
//  filter for jumps that are not close to the series edges and the right slope has the same direction
    let norm_rslope = abs(rslope/norm_range);
    let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
    let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
                              "trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
                              "lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
                              , bag_pack("trend_score", trend_score, "jump_score", jump_score));
    res
}

Example

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_shapes_fl=(series:dynamic, advanced:bool=false)
{
    let n = array_length(series);
//  calculate normal dynamic range between 10th and 90th percentiles
    let xs = array_sort_asc(series);
    let low_idx = tolong(n*0.1);
    let high_idx = tolong(n*0.9);
    let low_pct = todouble(xs[low_idx]);
    let high_pct = todouble(xs[high_idx]);
    let norm_range = high_pct-low_pct;
//  trend score
    let lf = series_fit_line_dynamic(series);
    let slope = todouble(lf.slope);
    let rsquare = todouble(lf.rsquare);
    let rel_slope = abs(n*slope/norm_range);
    let sign_slope = iff(slope >= 0.0, 1.0, -1.0);
    let norm_slope = sign_slope*rel_slope/(rel_slope+0.1);  //  map rel_slope from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let trend_score = norm_slope*rsquare;
//  jump score
    let lf2=series_fit_2lines_dynamic(series);
    let lslope = todouble(lf2.left.slope);
    let rslope = todouble(lf2.right.slope);
    let rsquare2 = todouble(lf2.rsquare);
    let split_idx = tolong(lf2.split_idx);
    let last_left = todouble(lf2.left.interception)+lslope*split_idx;
    let first_right = todouble(lf2.right.interception)+rslope;
    let jump = first_right-last_left;
    let rel_jump = abs(jump/norm_range);
    let sign_jump = iff(first_right >= last_left, 1.0, -1.0);
    let norm_jump = sign_jump*rel_jump/(rel_jump+0.1);  //  map rel_jump from [-Inf, +Inf] to [-1, 1]; 0.1 is a calibration constant
    let jump_score1 = norm_jump*rsquare2;
//  filter for jumps that are not close to the series edges and the right slope has the same direction
    let norm_rslope = abs(rslope/norm_range);
    let jump_score = iff((sign_jump*rslope >= 0.0 or norm_rslope < 0.02) and split_idx between((0.1*n)..(0.9*n)), jump_score1, 0.0);
    let res = iff(advanced, bag_pack("n", n, "low_pct", low_pct, "high_pct", high_pct, "norm_range", norm_range, "slope", slope, "rsquare", rsquare, "rel_slope", rel_slope, "norm_slope", norm_slope,
                              "trend_score", trend_score, "split_idx", split_idx, "jump", jump, "rsquare2", rsquare2, "last_left", last_left, "first_right", first_right, "rel_jump", rel_jump,
                              "lslope", lslope, "rslope", rslope, "norm_rslope", norm_rslope, "norm_jump", norm_jump, "jump_score", jump_score)
                              , bag_pack("trend_score", trend_score, "jump_score", jump_score));
    res
};
let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc 
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)

Stored

let ts_len = 100;
let noise_pct = 2;
let noise_gain = 3;
union
(print tsid=1 | extend y = array_concat(repeat(20, ts_len/2), repeat(150, ts_len/2))),
(print tsid=2 | extend y = array_concat(repeat(0, ts_len*3/4), repeat(-50, ts_len/4))),
(print tsid=3 | extend y = range(40, 139, 1)),
(print tsid=4 | extend y = range(-20, -109, -1))
| extend x = range(1, array_length(y), 1)
//
| extend shapes = series_shapes_fl(y)
| order by tsid asc 
| fork (take 4) (project tsid, shapes)
| render timechart with(series=tsid, xcolumn=x, ycolumns=y)

Output

Graph showing 4 time series with trends and jumps.

The respective trend and jump scores:

| tsid | shapes |
|--|--|
| 1 | {"trend_score": 0.703199714530169, "jump_score": 0.90909090909090906} |
| 2 | {"trend_score": -0.51663751343174869, "jump_score": -0.90909090909090906} |
| 3 | {"trend_score": 0.92592592592592582, "jump_score": 0.0} |
| 4 | {"trend_score": -0.92592592592592582, "jump_score": 0.0} |
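
As a sanity check, the trend score of tsid=3 (a noise-free ramp from 40 to 139) can be reproduced by hand from the formulas in the function body. The following standalone Python sketch is not part of the function:

# tsid=3: y = range(40, 139, 1), n = 100 samples.
n = 100
xs = sorted(range(40, 140))
low_pct, high_pct = xs[10], xs[90]          # 10th/90th percentile values: 50 and 130
norm_range = high_pct - low_pct             # 80
slope, rsquare = 1.0, 1.0                   # perfect linear fit: slope 1 per bin, rsquare 1
rel_slope = abs(n * slope / norm_range)     # 1.25
norm_slope = rel_slope / (rel_slope + 0.1)  # 0.1 is the calibration constant
print(norm_slope * rsquare)                 # 0.9259..., the trend_score listed above for tsid=3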

53 - series_uv_anomalies_fl()

This article describes the series_uv_anomalies_fl() user-defined function.

The function series_uv_anomalies_fl() is a user-defined function (UDF) that detects anomalies in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays and the required anomaly detection sensitivity level. It converts each time series into the required JSON format and posts it to the Anomaly Detector service endpoint. The service response contains dynamic arrays of high/low/all anomalies, the modeled baseline time series, its normal high/low boundaries (a value above or below the high/low boundary is an anomaly), and the detected seasonality.
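
For reference, the request body that the embedded Python code builds for each time series has roughly the following shape. This is an illustrative sketch with made-up values, not part of the function:

import json

# 'sensitivity' comes from the function argument; 'period' or 'maxAnomalyRatio'
# could optionally be added, as noted in the comment inside the function body.
values = [120.5, 118.2, 121.0, 350.7, 119.4]
json_data = {"series": [{"value": v} for v in values], "sensitivity": 85}
print(json.dumps(json_data))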

Prerequisites

In the following function example, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.

Syntax

T | invoke series_uv_anomalies_fl(y_series [, sensitivity [, tsid]])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | string | ✔️ | The name of the input table column containing the values of the series to be anomaly detected. |
| sensitivity | integer | | An integer in the range [0-100] specifying the anomaly detection sensitivity. 0 is the least sensitive detection, while 100 is the most sensitive, indicating that even a small deviation from the expected baseline is tagged as an anomaly. Default value: 85 |
| tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        sensitivity = kargs["sensitivity"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "sensitivity":sensitivity}     # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
       | project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
       , neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
       | extend _tsid=toscalar(_tsid)
      )
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Anomaly Detection by Azure Cognitive Service")
series_uv_anomalies_fl(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        sensitivity = kargs["sensitivity"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "sensitivity":sensitivity}     # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
       | project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
       , neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
       | extend _tsid=toscalar(_tsid)
      )
}

Examples

The following examples use the invoke operator to run the function.

Use series_uv_anomalies_fl() to detect anomalies

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        sensitivity = kargs["sensitivity"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "sensitivity":sensitivity}     # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
       | project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
       , neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
       | extend _tsid=toscalar(_tsid)
      )
};
let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)

Stored

let etime=datetime(2017-03-02);
let stime=datetime(2017-01-01);
let dt=1h;
let ts = requests
| make-series value=avg(value) on timestamp from stime to etime step dt
| extend _tsid='TS1';
ts
| invoke series_uv_anomalies_fl('value')
| lookup ts on _tsid
| render anomalychart with(xcolumn=timestamp, ycolumns=value, anomalycolumns=ad_ama)

Output

Graph showing anomalies on a time series.

Compare series_uv_anomalies_fl() and native series_decompose_anomalies()

The following example compares the Univariate Anomaly Detection API to the native series_decompose_anomalies() function over three time series and assumes the series_uv_anomalies_fl() function is already defined in the database:

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_uv_anomalies_fl=(tbl:(*), y_series:string, sensitivity:int=85, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/entire/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'sensitivity', sensitivity);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        sensitivity = kargs["sensitivity"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "sensitivity":sensitivity}     # auto-detect period, or we can force 'period': 84. We can also add 'maxAnomalyRatio':0.25 for maximum 25% anomalies
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
       | project period=ResponseBody.period, baseline_ama=ResponseBody.expectedValues, ad_ama=series_add(0, ResponseBody.isAnomaly), pos_ad_ama=series_add(0, ResponseBody.isPositiveAnomaly)
       , neg_ad_ama=series_add(0, ResponseBody.isNegativeAnomaly), upper_ama=series_add(ResponseBody.expectedValues, ResponseBody.upperMargins), lower_ama=series_subtract(ResponseBody.expectedValues, ResponseBody.lowerMargins)
       | extend _tsid=toscalar(_tsid)
      )
};
let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 'sid', 90)
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)

Stored

let ts = demo_make_series2
| summarize TimeStamp=make_list(TimeStamp), num=make_list(num) by sid;
ts
| invoke series_uv_anomalies_fl('num', 'sid', 90)
| join ts on $left._tsid == $right.sid
| project-away _tsid
| extend (ad_adx, score_adx, baseline_adx)=series_decompose_anomalies(num, 1.5, -1, 'linefit')
| project-reorder num, *
| render anomalychart with(series=sid, xcolumn=TimeStamp, ycolumns=num, baseline_adx, baseline_ama, lower_ama, upper_ama, anomalycolumns=ad_adx, ad_ama)

Output

The following graph shows anomalies detected by the Univariate Anomaly Detection API on TS1. You can also select TS2 or TS3 in the chart filter box.

Graph showing anomalies using the Univariate A P I on a time series.

The following graph shows the anomalies detected by the native function on TS1.

Graph showing anomalies using the native function on a time series.

54 - series_uv_change_points_fl()

This article describes the series_uv_change_points_fl() user-defined function.

The function series_uv_change_points_fl() is a user-defined function (UDF) that finds change points in time series by calling the Univariate Anomaly Detection API, part of Azure Cognitive Services. The function accepts a limited set of time series as numerical dynamic arrays, the change point detection threshold, and the minimum size of the stable trend window. It converts each time series into the required JSON format and posts it to the Anomaly Detector service endpoint. The service response contains dynamic arrays of change points, their respective confidence scores, and the detected seasonality.
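
The per-series request body is similar to the anomaly detection case, with the threshold and stable trend window sent alongside the values. An illustrative sketch with made-up values, not part of the function:

import json

# 'threshold' and 'stableTrendWindow' come from the score_threshold and trend_window arguments.
values = [5.1, 5.3, 4.9, 20.2, 20.5, 19.8]
json_data = {"series": [{"value": v} for v in values], "threshold": 0.9, "stableTrendWindow": 5}
print(json.dumps(json_data))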

Prerequisites

In the following function definitions, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.

Syntax

T | invoke series_uv_change_points_fl(y_series [, score_threshold [, trend_window [, tsid]]])

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| y_series | string | ✔️ | The name of the input table column containing the values of the series to be anomaly detected. |
| score_threshold | real | | A value specifying the minimum confidence to declare a change point. Each point whose confidence is above the threshold is defined as a change point. Default value: 0.9 |
| trend_window | integer | | A value specifying the minimal window size for robust calculation of trend changes. Default value: 5 |
| tsid | string | | The name of the input table column containing the time series ID. Can be omitted when analyzing a single time series. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.

let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        score_threshold = kargs["score_threshold"]
        trend_window = kargs["trend_window"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window}     # auto-detect period, or we can force 'period': 84
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
        | project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
        | extend _tsid=toscalar(_tsid)
       )
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required. In the following function definition, replace YOUR-AD-RESOURCE-NAME in the uri and YOUR-KEY in the Ocp-Apim-Subscription-Key of the header with your Anomaly Detector resource name and key.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time Series Change Points Detection by Azure Cognitive Service")
series_uv_change_points_fl(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        score_threshold = kargs["score_threshold"]
        trend_window = kargs["trend_window"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window}     # auto-detect period, or we can force 'period': 84
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
        | project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
        | extend _tsid=toscalar(_tsid)
       )
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let series_uv_change_points_fl=(tbl:(*), y_series:string, score_threshold:real=0.9, trend_window:int=5, tsid:string='_tsid')
{
    let uri = 'https://YOUR-AD-RESOURCE-NAME.cognitiveservices.azure.com/anomalydetector/v1.0/timeseries/changepoint/detect';
    let headers=dynamic({'Ocp-Apim-Subscription-Key': h'YOUR-KEY'});
    let kwargs = bag_pack('y_series', y_series, 'score_threshold', score_threshold, 'trend_window', trend_window);
    let code = ```if 1:
        import json
        y_series = kargs["y_series"]
        score_threshold = kargs["score_threshold"]
        trend_window = kargs["trend_window"]
        json_str = []
        for i in range(len(df)):
            row = df.iloc[i, :]
            ts = [{'value':row[y_series][j]} for j in range(len(row[y_series]))]
            json_data = {'series': ts, "threshold":score_threshold, "stableTrendWindow": trend_window}     # auto-detect period, or we can force 'period': 84
            json_str = json_str + [json.dumps(json_data)]
        result = df
        result['json_str'] = json_str
    ```;
    tbl
    | evaluate python(typeof(*, json_str:string), code, kwargs)
    | extend _tsid = column_ifexists(tsid, 1)
    | partition by _tsid (
       project json_str
       | evaluate http_request_post(uri, headers, dynamic(null))
        | project period=ResponseBody.period, change_point=series_add(0, ResponseBody.isChangePoint), confidence=ResponseBody.confidenceScores
        | extend _tsid=toscalar(_tsid)
       )
};
let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, *      //  just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)

Stored

let ts = range x from 1 to 300 step 1
| extend y=iff(x between (100 .. 110) or x between (200 .. 220), 20, 5)
| extend ts=datetime(2021-01-01)+x*1d
| extend y=y+4*rand()
| summarize ts=make_list(ts), y=make_list(y)
| extend sid=1;
ts
| invoke series_uv_change_points_fl('y', 0.8, 10, 'sid')
| join ts on $left._tsid == $right.sid
| project-away _tsid
| project-reorder y, *      //  just to visualize the anomalies on top of y series
| render anomalychart with(xcolumn=ts, ycolumns=y, confidence, anomalycolumns=change_point)

Output

The following graph shows change points on a time series.

Graph showing change points on a time series.

55 - time_weighted_avg_fl()

This article describes the time_weighted_avg_fl() user-defined function.

The function time_weighted_avg_fl() is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating the simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.

There are two options to calculate the time weighted average. This function fills forward the value from the current sample until the next one. Alternatively, time_weighted_avg2_fl() linearly interpolates the metric value between consecutive samples.

Syntax

T | invoke time_weighted_avg_fl(t_col, y_col, key_col, stime, etime, dt)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
        order by _ts asc, _val nulls last
        | scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
        | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using fill forward interpolation")
time_weighted_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
        order by _ts asc, _val nulls last
        | scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
        | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let time_weighted_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
        order by _ts asc, _val nulls last
        | scan declare(f_value:real=0.0) with (step s: true => f_value = iff(isnull(_val), s.f_value, _val);) // fill forward null values
        | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | summarize tw_sum=sum(f_value*diff_t), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
};
let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc

Stored

let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc

Output

| _ts | _key | val |
|--|--|--|
| 2021-04-26 00:00:00.0000000 | Device1 | 150 |
| 2021-04-26 01:00:00.0000000 | Device1 | 225 |
| 2021-04-26 00:00:00.0000000 | Device2 | 500 |
| 2021-04-26 01:00:00.0000000 | Device2 | 400 |

The first value of Device1 is (45m*100 + 15m*300)/60m = 150, and the second value is (15m*300 + 45m*200)/60m = 225.
The first value of Device2 is (30m*600 + 30m*400)/60m = 500, and the second value is (30m*400 + 15m*500 + 15m*300)/60m = 400.
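
The same arithmetic as a quick standalone Python check (not part of the function):

# Fill-forward weighting: each value holds from its own timestamp until the next
# sample, and each 1h (60m) bin averages the values weighted by how long they held.
d1_bin1 = (45*100 + 15*300) / 60            # Device1, 00:00 bin -> 150.0
d1_bin2 = (15*300 + 45*200) / 60            # Device1, 01:00 bin -> 225.0
d2_bin1 = (30*600 + 30*400) / 60            # Device2, 00:00 bin -> 500.0
d2_bin2 = (30*400 + 15*500 + 15*300) / 60   # Device2, 01:00 bin -> 400.0
print(d1_bin1, d1_bin2, d2_bin1, d2_bin2)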

56 - time_weighted_avg2_fl()

This article describes the time_weighted_avg2_fl() user-defined function.

The function time_weighted_avg2_fl() is a user-defined function (UDF) that calculates the time weighted average of a metric in a given time window, over input time bins. This function is similar to the summarize operator. The function aggregates the metric by time bins, but instead of calculating the simple avg() of the metric value in each bin, it weights each value by its duration. The duration is defined from the timestamp of the current value to the timestamp of the next value.

There are two options to calculate the time weighted average. This function linearly interpolates the metric value between consecutive samples. Alternatively, time_weighted_avg_fl() fills forward the value from the current sample until the next one.

Syntax

T | invoke time_weighted_avg2_fl(t_col, y_col, key_col, stime, etime, dt)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | order by _key asc, _ts asc
    | extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
    | summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
    | order by _key asc, _ts asc 
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time weighted average of a metric using linear interpolation")
time_weighted_avg2_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | order by _key asc, _ts asc
    | extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
    | summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
    | order by _key asc, _ts asc 
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let time_weighted_avg2_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let _etime = etime + dt;
    let gridTimes = range _ts from stime to _etime step dt | extend _val=real(null), dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union tbl_ex
    | where _ts between (stime.._etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | extend diff_t=(next(_ts)-_ts)/1m
    )
    | where isnotnull(diff_t)
    | order by _key asc, _ts asc
    | extend next_twa_val=iff(_key == next(_key), next(_twa_val), _twa_val)
    | summarize tw_sum=sum((_twa_val+next_twa_val)*diff_t/2.0), t_sum =sum(diff_t) by bin_at(_ts, dt, stime), _key
    | where t_sum > 0 and _ts <= etime
    | extend tw_avg = tw_sum/t_sum
    | project-away tw_sum, t_sum
    | order by _key asc, _ts asc 
};
let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc

Stored

let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_avg2_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = tw_avg
| order by _key asc, _ts asc

Output

| _ts | _key | val |
|--|--|--|
| 2021-04-26 00:00:00.0000000 | Device1 | 218.75 |
| 2021-04-26 01:00:00.0000000 | Device1 | 206.25 |
| 2021-04-26 00:00:00.0000000 | Device2 | 462.5 |
| 2021-04-26 01:00:00.0000000 | Device2 | 412.5 |

The first value of Device1 is (45m*(100+300)/2 + 15m*(300+250)/2)/60m = 218.75, and the second value is (15m*(250+200)/2 + 45m*200)/60m = 206.25.
The first value of Device2 is (30m*(600+400)/2 + 30m*(400+450)/2)/60m = 462.5, and the second value is (30m*(450+500)/2 + 15m*(500+300)/2 + 15m*300)/60m = 412.5.
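
Here, 250 and 450 are the values interpolated at the 01:00 grid point of Device1 and Device2, respectively. A quick standalone Python check of the Device1 arithmetic (not part of the function):

# 250 is interpolated at 01:00, midway between 300 (at 00:45) and 200 (at 01:15).
interp_0100 = (300*15 + 200*15) / (15 + 15)               # -> 250.0
d1_bin1 = (45*(100+300)/2 + 15*(300+interp_0100)/2) / 60  # 00:00 bin -> 218.75
d1_bin2 = (15*(interp_0100+200)/2 + 45*200) / 60          # 01:00 bin -> 206.25
print(interp_0100, d1_bin1, d1_bin2)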

57 - time_weighted_val_fl()

This article describes the time_weighted_val_fl() user-defined function.

The function time_weighted_val_fl() is a user-defined function (UDF) that linearly interpolates the metric value, taking the time weighted average of the values of its previous and next points.

Syntax

T | invoke time_weighted_val_fl(t_col, y_col, key_col, stime, etime, dt)

Parameters

| Name | Type | Required | Description |
|--|--|--|--|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| stime | datetime | ✔️ | The start time of the aggregation window. |
| etime | datetime | ✔️ | The end time of the aggregation window. |
| dt | timespan | ✔️ | The aggregation time bin. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union (tbl_ex | extend grid=0)
    | where _ts between (stime..etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | where grid == 0 or (grid == 1 and _ts != prev(_ts))
    )
    | project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
    | order by _key asc, _ts asc
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Linear interpolation of metric value by time weighted average")
time_weighted_val_fl(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union (tbl_ex | extend grid=0)
    | where _ts between (stime..etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | where grid == 0 or (grid == 1 and _ts != prev(_ts))
    )
    | project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
    | order by _key asc, _ts asc
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let time_weighted_val_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, stime:datetime, etime:datetime, dt:timespan)
{
    let tbl_ex = tbl | extend _ts = column_ifexists(t_col, datetime(null)), _val = column_ifexists(y_col, 0.0), _key = column_ifexists(key_col, '');
    let gridTimes = range _ts from stime to etime step dt | extend _val=real(null), grid=1, dummy=1;
    let keys = materialize(tbl_ex | summarize by _key | extend dummy=1);
    gridTimes
    | join kind=fullouter keys on dummy
    | project-away dummy, dummy1
    | union (tbl_ex | extend grid=0)
    | where _ts between (stime..etime)
    | partition hint.strategy=native by _key (
      order by _ts desc, _val nulls last
    | scan declare(val1:real=0.0, t1:datetime) with (                // fill backward null values
        step s: true => val1=iff(isnull(_val), s.val1, _val), t1=iff(isnull(_val), s.t1, _ts);)
    | extend dt1=(t1-_ts)/1m
    | order by _ts asc, _val nulls last
    | scan declare(val0:real=0.0, t0:datetime) with (                // fill forward null values
        step s: true => val0=iff(isnull(_val), s.val0, _val), t0=iff(isnull(_val), s.t0, _ts);)
    | extend dt0=(_ts-t0)/1m
    | extend _twa_val=iff(dt0+dt1 == 0, _val, ((val0*dt1)+(val1*dt0))/(dt0+dt1))
    | scan with (                                                    // fill forward null twa values
        step s: true => _twa_val=iff(isnull(_twa_val), s._twa_val, _twa_val);)
    | where grid == 0 or (grid == 1 and _ts != prev(_ts))
    )
    | project _ts, _key, _twa_val, orig_val=iff(grid == 1, 0, 1)
    | order by _key asc, _ts asc
};
let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc

Stored

let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(2021-04-26 00:00), 100, 'Device1',
    datetime(2021-04-26 00:45), 300, 'Device1',
    datetime(2021-04-26 01:15), 200, 'Device1',
    datetime(2021-04-26 00:00), 600, 'Device2',
    datetime(2021-04-26 00:30), 400, 'Device2',
    datetime(2021-04-26 01:30), 500, 'Device2',
    datetime(2021-04-26 01:45), 300, 'Device2'
];
let minmax=materialize(tbl | summarize mint=min(ts), maxt=max(ts));
let stime=toscalar(minmax | project mint);
let etime=toscalar(minmax | project maxt);
let dt = 1h;
tbl
| invoke time_weighted_val_fl('ts', 'val', 'key', stime, etime, dt)
| project-rename val = _twa_val
| order by _key asc, _ts asc

Output

| _ts | _key | val | orig_val |
|---|---|---|---|
| 2021-04-26 00:00:00.0000000 | Device1 | 100 | 1 |
| 2021-04-26 00:45:00.0000000 | Device1 | 300 | 1 |
| 2021-04-26 01:00:00.0000000 | Device1 | 250 | 0 |
| 2021-04-26 01:15:00.0000000 | Device1 | 200 | 1 |
| 2021-04-26 00:00:00.0000000 | Device2 | 600 | 1 |
| 2021-04-26 00:30:00.0000000 | Device2 | 400 | 1 |
| 2021-04-26 01:00:00.0000000 | Device2 | 450 | 0 |
| 2021-04-26 01:30:00.0000000 | Device2 | 500 | 1 |
| 2021-04-26 01:45:00.0000000 | Device2 | 300 | 1 |
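
The interpolated grid values (orig_val = 0) follow directly from the linear interpolation formula in the function. For example, the value for Device1 at 01:00 lies between the samples at 00:45 (300) and 01:15 (200): with dt0 = 15m and dt1 = 15m, _twa_val = (300*15 + 200*15) / (15 + 15) = 250. Similarly, Device2 at 01:00 gives (400*30 + 500*30) / (30 + 30) = 450.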

58 - time_window_rolling_avg_fl()

This article describes the time_window_rolling_avg_fl() user-defined function.

The function time_window_rolling_avg_fl() is a user-defined function (UDF) that calculates the rolling average of a metric over a constant-duration time window.

Calculating the rolling average over a constant time window for a regular time series (that is, one with constant intervals) can be achieved using series_fir(), because the constant time window can be converted to a fixed-width filter of equal coefficients. However, calculating it for an irregular time series is more complex, because the actual number of samples in the window varies. Still, it can be achieved using the powerful scan operator.
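
For the regular case, a minimal sketch of the series_fir() approach is shown below. The demo_metrics table and its columns (ts, val, key) are hypothetical; the sketch assumes a 1-minute grid and a 10-minute backward window, so the filter is ten equal coefficients:

let window_bins = 10;                      // 10 bins x 1m grid = a 10-minute window
demo_metrics                               // hypothetical table with ts (datetime), val (real), key (string)
| make-series val=avg(val) default=0.0 on ts step 1m by key
| extend rolling_avg = series_fir(val, repeat(1, window_bins), true, false)  // normalize=true -> average; center=false -> backward window
| mv-expand ts to typeof(datetime), val to typeof(real), rolling_avg to typeof(real)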

This type of rolling window calculation is required for use cases where metric values are emitted only when they change, rather than at constant intervals. For example, in IoT, edge devices send metrics to the cloud only upon changes, to optimize communication bandwidth.

Syntax

T | invoke time_window_rolling_avg_fl(t_col, y_col, key_col, dt [, direction ])

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| t_col | string | ✔️ | The name of the column containing the time stamp of the records. |
| y_col | string | ✔️ | The name of the column containing the metric value of the records. |
| key_col | string | ✔️ | The name of the column containing the partition key of the records. |
| dt | timespan | ✔️ | The duration of the rolling window. |
| direction | int | | The aggregation direction. The possible values are +1 or -1. The rolling window is set from the current time forward or backward, respectively. Default is -1, as a backward rolling window is the only possible method for streaming scenarios. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
    let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
    tbl_ex 
    | partition hint.strategy=shuffle by key 
    (
        extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
        | mv-expand timestamp to typeof(datetime), delta to typeof(long)
        | sort by timestamp asc, delta desc    
        | scan declare (cum_sum:double=0.0, cum_count:long=0) with 
        (
            step s: true => cum_count = s.cum_count + delta, 
                            cum_sum = s.cum_sum + delta * value; 
        )
        | extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
        | where delta == -direction 
        | project timestamp, value, avg_value, key
    )
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Series", docstring = "Time based rolling average of a metric")
time_window_rolling_avg_fl(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
    let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
    tbl_ex 
    | partition hint.strategy=shuffle by key 
    (
        extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
        | mv-expand timestamp to typeof(datetime), delta to typeof(long)
        | sort by timestamp asc, delta desc    
        | scan declare (cum_sum:double=0.0, cum_count:long=0) with 
        (
            step s: true => cum_count = s.cum_count + delta, 
                            cum_sum = s.cum_sum + delta * value; 
        )
        | extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
        | where delta == -direction 
        | project timestamp, value, avg_value, key
    )
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let time_window_rolling_avg_fl=(tbl:(*), t_col:string, y_col:string, key_col:string, dt:timespan, direction:int=int(-1))
{
    let tbl_ex = tbl | extend timestamp = column_ifexists(t_col, datetime(null)), value = column_ifexists(y_col, 0.0), key = column_ifexists(key_col, '');
    tbl_ex 
    | partition hint.strategy=shuffle by key 
    (
        extend timestamp=pack_array(timestamp, timestamp - direction*dt), delta = pack_array(-direction, direction)
        | mv-expand timestamp to typeof(datetime), delta to typeof(long)
        | sort by timestamp asc, delta desc    
        | scan declare (cum_sum:double=0.0, cum_count:long=0) with 
        (
            step s: true => cum_count = s.cum_count + delta, 
                            cum_sum = s.cum_sum + delta * value; 
        )
        | extend avg_value = iff(direction == 1, prev(cum_sum)/prev(cum_count), cum_sum/cum_count)
        | where delta == -direction 
        | project timestamp, value, avg_value, key
    )
};
let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(8:00), 1, 'Device1',
    datetime(8:01), 2, 'Device1',
    datetime(8:05), 3, 'Device1',
    datetime(8:05), 10, 'Device2',
    datetime(8:09), 20, 'Device2',
    datetime(8:40), 4, 'Device1',
    datetime(9:00), 5, 'Device1',
    datetime(9:01), 6, 'Device1',
    datetime(9:05), 30, 'Device2',
    datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)

Stored

let tbl = datatable(ts:datetime,  val:real, key:string) [
    datetime(8:00), 1, 'Device1',
    datetime(8:01), 2, 'Device1',
    datetime(8:05), 3, 'Device1',
    datetime(8:05), 10, 'Device2',
    datetime(8:09), 20, 'Device2',
    datetime(8:40), 4, 'Device1',
    datetime(9:00), 5, 'Device1',
    datetime(9:01), 6, 'Device1',
    datetime(9:05), 30, 'Device2',
    datetime(9:50), 7, 'Device1'
];
tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m)

Output

| timestamp | value | avg_value | key |
|---|---|---|---|
| 2021-11-29 08:05:00.0000000 | 10 | 10 | Device2 |
| 2021-11-29 08:09:00.0000000 | 20 | 15 | Device2 |
| 2021-11-29 09:05:00.0000000 | 30 | 30 | Device2 |
| 2021-11-29 08:00:00.0000000 | 1 | 1 | Device1 |
| 2021-11-29 08:01:00.0000000 | 2 | 1.5 | Device1 |
| 2021-11-29 08:05:00.0000000 | 3 | 2 | Device1 |
| 2021-11-29 08:40:00.0000000 | 4 | 4 | Device1 |
| 2021-11-29 09:00:00.0000000 | 5 | 5 | Device1 |
| 2021-11-29 09:01:00.0000000 | 6 | 5.5 | Device1 |
| 2021-11-29 09:50:00.0000000 | 7 | 7 | Device1 |

The first average (10) at 8:05 is based on a single sample, the only one that falls within the 10-minute backward window; the second average (15) is the mean of the two samples at 8:05 and 8:09, and so on. Internally, the function expands each sample into a window-entry and a window-exit event (delta of +1 and -1), and the scan operator maintains a running count and sum from which the average is derived.
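
To aggregate over a forward-looking window instead, pass direction=1. For example, reusing the tbl defined above with the stored function:

tbl
| invoke time_window_rolling_avg_fl('ts', 'val', 'key', 10m, int(1))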

59 - two_sample_t_test_fl()

This article describes the two_sample_t_test_fl() user-defined function.

The function two_sample_t_test_fl() is a user-defined function (UDF) that performs the Two-Sample T-Test.

Syntax

T | invoke two_sample_t_test_fl(data1, data2, test_statistic, p_value, equal_var)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| data1 | string | ✔️ | The name of the column containing the first set of data to be used for the test. |
| data2 | string | ✔️ | The name of the column containing the second set of data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |
| equal_var | bool | | If true (default), performs a standard independent 2 sample test that assumes equal population variances. If false, performs Welch's t-test, which doesn't assume equal population variance; in that case, consider using the native welch_test() instead. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
    let code = ```if 1:
        from scipy import stats
        import pandas
        
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        equal_var = kargs["equal_var"]
        
        def func(row):
            statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Two-Sample t-Test")
two_sample_t_test_fl(tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
    let code = ```if 1:
        from scipy import stats
        import pandas
        
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        equal_var = kargs["equal_var"]
        
        def func(row):
            statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let two_sample_t_test_fl = (tbl:(*), data1:string, data2:string, test_statistic:string, p_value:string, equal_var:bool=true)
{
    let kwargs = bag_pack('data1', data1, 'data2', data2, 'test_statistic', test_statistic, 'p_value', p_value, 'equal_var', equal_var);
    let code = ```if 1:
        from scipy import stats
        import pandas
        
        data1 = kargs["data1"]
        data2 = kargs["data2"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        equal_var = kargs["equal_var"]
        
        def func(row):
            statistics = stats.ttest_ind(row[data1], row[data2], equal_var=equal_var)
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56]),
'Test #2', dynamic([20.85, 21.89, 23.41]), dynamic([35.09, 30.02, 26.52]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02]), dynamic([32.2, 32.79, 33.9, 34.22])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val')

Output

| id | sample1 | sample2 | test_stat | p_val |
|---|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | [27.1, 22.12, 33.56] | -1.7415675457565645 | 0.15655096653487446 |
| Test #2 | [20.85, 21.89, 23.41] | [35.09, 30.02, 26.52] | -3.2711673491022579 | 0.030755331219276136 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | [32.2, 32.79, 33.9, 34.22] | -18.5515946201742 | 1.5823717131966134E-06 |
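
To run Welch's t-test, which doesn't assume equal population variances, pass false for the equal_var parameter. For example, using the stored function:

datatable(id:string, sample1:dynamic, sample2:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]), dynamic([27.1, 22.12, 33.56])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke two_sample_t_test_fl('sample1', 'sample2', 'test_stat', 'p_val', false)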

60 - User-defined functions

This article describes user-defined functions (scalar and views).

User-defined functions are reusable subqueries that can be defined as part of the query itself (query-defined functions), or stored as part of the database metadata (stored functions). User-defined functions are invoked through a name, are provided with zero or more input arguments (which can be scalar or tabular), and produce a single value (which can be scalar or tabular) based on the function body.

A user-defined function belongs to one of two categories:

  • Scalar functions
  • Tabular functions

The function’s input arguments and output determine whether it’s scalar or tabular, which then establishes how it might be used.

To optimize multiple uses of the user-defined functions within a single query, see Optimize queries that use named expressions.

We’ve created an assortment of user-defined functions that you can use in your queries. For more information, see Functions library.

Scalar function

  • Has zero input arguments, or all its input arguments are scalar values
  • Produces a single scalar value
  • Can be used wherever a scalar expression is allowed
  • May only use the row context in which it’s defined
  • Can only refer to tables (and views) that are in the accessible schema

Tabular function

  • Accepts one or more tabular input arguments and zero or more scalar input arguments, and/or
  • Produces a single tabular value

Function names

Valid user-defined function names must follow the same identifier naming rules as other entities.

The name must also be unique in its scope of definition.

Input arguments

Valid user-defined functions follow these rules:

  • A user-defined function has a strongly typed list of zero or more input arguments.
  • An input argument has a name, a type, and (for scalar arguments) optionally a default value.
  • The name of an input argument is an identifier.
  • The type of an input argument is either one of the scalar data types, or a tabular schema.

Syntactically, the input arguments list is a comma-separated list of argument definitions, wrapped in parentheses. Each argument definition is specified as

ArgName:ArgType [= ArgDefaultValue]

For tabular arguments, ArgType has the same syntax as the table definition (parenthesis and a list of column name/type pairs), with the addition of a solitary (*) indicating “any tabular schema”.

For example:

| Syntax | Input arguments list description |
|---|---|
| () | No arguments |
| (s:string) | Single scalar argument called s taking a value of type string |
| (a:long, b:bool=true) | Two scalar arguments, the second of which has a default value |
| (T1:(*), T2:(r:real), b:bool) | Three arguments (two tabular arguments and one scalar argument) |

Examples

Scalar function

let Add7 = (arg0:long = 5) { arg0 + 7 };
range x from 1 to 10 step 1
| extend x_plus_7 = Add7(x), five_plus_seven = Add7()

Tabular function with no arguments

let tenNumbers = () { range x from 1 to 10 step 1};
tenNumbers
| extend x_plus_7 = x + 7

Tabular function with arguments

let MyFilter = (T:(x:long), v:long) {
  T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)

Output

x
9
10

The following tabular function uses a tabular input with no columns specified. Any table can be passed to the function, but no table columns can be referenced inside the function.

let MyDistinct = (T:(*)) {
  T | distinct *
};
MyDistinct((range x from 1 to 3 step 1))

Output

x
1
2
3

Declaring user-defined functions

The declaration of a user-defined function provides:

  • Function name
  • Function schema (parameters it accepts, if any)
  • Function body

let f=(s:string, i:long) {
    tolong(s) * i
};

The function body includes:

  • Exactly one expression, which provides the function’s return value (scalar or tabular value).
  • Any number (zero or more) of let statements, whose scope is that of the function body. If specified, the let statements must precede the expression defining the function’s return value.
  • Any number (zero or more) of query parameters statements, which declare query parameters used by the function. If specified, they must precede the expression defining the function’s return value.

Examples of user-defined functions

The following section shows examples of how to use user-defined functions.

User-defined function that uses a let statement

The following example shows a user-defined function (lambda) that accepts a parameter named ID. The function is bound to the name Test and makes use of three let statements, in which the Test3 definition uses the ID parameter. When run, the output from the query is 70:

let Test = (id: int) {
  let Test2 = 10;
  let Test3 = 10 + Test2 + id;
  let Test4 = (arg: int) {
      let Test5 = 20;
      Test2 + Test3 + Test5 + arg
  };
  Test4(10)
};
range x from 1 to Test(10) step 1
| count

User-defined function that defines a default value for a parameter

The following example shows a function that accepts three arguments. The latter two have a default value and don’t have to be present at the call site.

let f = (a:long, b:string = "b.default", c:long = 0) {
  strcat(a, "-", b, "-", c)
};
print f(12, c=7) // Returns "12-b.default-7"

Invoking a user-defined function

The method to invoke a user-defined function depends on the arguments that the function expects to receive. The following sections cover how to invoke a UDF without arguments, invoke a UDF with scalar arguments, and invoke a UDF with tabular arguments.

Invoke a UDF without arguments

A user-defined function that takes no arguments can be invoked either by its name, or by its name followed by an empty argument list in parentheses.

// Bind the identifier a to a user-defined function (lambda) that takes
// no arguments and returns a constant of type long:
let a=(){123};
// Invoke the function in two equivalent ways:
range x from 1 to 10 step 1
| extend y = x * a, z = x * a()
// Bind the identifier T to a user-defined function (lambda) that takes
// no arguments and returns a random two-by-two table:
let T=(){
  range x from 1 to 2 step 1
  | project x1 = rand(), x2 = rand()
};
// Invoke the function in two equivalent ways:
// (Note that the second invocation must be itself wrapped in
// an additional set of parentheses, as the union operator
// differentiates between "plain" names and expressions)
union T, (T())

Invoke a UDF with scalar arguments

A user-defined function that takes one or more scalar arguments can be invoked by using the function name and a concrete argument list in parentheses:

let f=(a:string, b:string) {
  strcat(a, " (la la la)", b)
};
print f("hello", "world")

Invoke a UDF with tabular arguments

A user-defined function that takes one or more table arguments (with any number of scalar arguments) can be invoked using the function name and a concrete argument list in parentheses:

let MyFilter = (T:(x:long), v:long) {
  T | where x >= v
};
MyFilter((range x from 1 to 10 step 1), 9)

You can also use the invoke operator to run a user-defined function that takes one or more table arguments and returns a table. This approach is useful when the first concrete table argument to the function is the source of the invoke operator:

let append_to_column_a=(T:(a:string), what:string) {
    T | extend a=strcat(a, " ", what)
};
datatable (a:string) ["sad", "really", "sad"]
| invoke append_to_column_a(":-)")
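
Each value in column a gets the suffix appended, producing the following output:

| a |
|---|
| sad :-) |
| really :-) |
| sad :-) |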

Default values

Functions may provide default values to some of their parameters under the following conditions:

  • Default values may be provided for scalar parameters only.
  • Default values are always literals (constants). They can’t be arbitrary calculations.
  • Parameters with no default value always precede parameters that do have a default value.
  • Callers must provide the value of all parameters with no default values arranged in the same order as the function declaration.
  • Callers don’t need to provide the value for parameters with default values, but may do so.
  • Callers may provide arguments in an order that doesn’t match the order of the parameters. If so, they must name their arguments.

The following example returns a table with two identical records. In the first invocation of f, the arguments are completely “scrambled”, so each one is explicitly given a name:

let f = (a:long, b:string = "b.default", c:long = 0) {
  strcat(a, "-", b, "-", c)
};
union
  (print x=f(c=7, a=12)), // "12-b.default-7"
  (print x=f(12, c=7))    // "12-b.default-7"

Output

x
12-b.default-7
12-b.default-7

View functions

A user-defined function that takes no arguments and returns a tabular expression can be marked as a view. Marking a user-defined function as a view means that the function behaves like a table whenever a wildcard table name resolution is performed.

The following example shows two user-defined functions, T_view and T_notview, and shows how only the first one is resolved by the wildcard reference in the union:

let T_view = view () { print x=1 };
let T_notview = () { print x=2 };
union T*
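
Assuming no other tables or views in scope match the T* wildcard, only T_view is resolved, so the union returns a single row:

| x |
|---|
| 1 |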

Restrictions

The following restrictions apply:

  • User-defined functions can’t pass into toscalar() invocation information that depends on the row-context in which the function is called.
  • User-defined functions that return a tabular expression can’t be invoked with an argument that varies with the row context.
  • A function taking at least one tabular input can’t be invoked on a remote cluster.
  • A scalar function can’t be invoked on a remote cluster.

The only place a user-defined function may be invoked with an argument that varies with the row context is when the user-defined function is composed of scalar functions only and doesn’t use toscalar().

Examples

Supported scalar function

The following query is supported because f is a scalar function that doesn’t reference any tabular expression.

let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { now() + hours*1h };
Table2 | where Column != 123 | project d = f(10)

The following query is supported because f is a scalar function that references the tabular expression Table1 but is invoked with no reference to the current row context f(10):

let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(10)

Unsupported scalar function

The following query isn’t supported because f is a scalar function that references the tabular expression Table1, and is invoked with a reference to the current row context f(Column):

let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { toscalar(Table1 | summarize min(xdate) - hours*1h) };
Table2 | where Column != 123 | project d = f(Column)

Unsupported tabular function

The following query isn’t supported because f is a tabular function that is invoked in a context that expects a scalar value.

let Table1 = datatable(xdate:datetime)[datetime(1970-01-01)];
let Table2 = datatable(Column:long)[1235];
let f = (hours:long) { range x from 1 to hours step 1 | summarize make_list(x) };
Table2 | where Column != 123 | project d = f(Column)

Features that are currently unsupported by user-defined functions

For completeness, here are some commonly requested features for user-defined functions that are currently not supported:

  1. Function overloading: There’s currently no way to overload a function (a way to create multiple functions with the same name and different input schema).

  2. Default values: The default value for a scalar parameter to a function must be a scalar literal (constant).

61 - wilcoxon_test_fl()

This article describes the wilcoxon_test_fl() user-defined function.

The function wilcoxon_test_fl() is a user-defined function (UDF) that performs the Wilcoxon signed-rank test.

Syntax

T | invoke wilcoxon_test_fl(data, test_statistic, p_value)

Parameters

| Name | Type | Required | Description |
|---|---|---|---|
| data | string | ✔️ | The name of the column containing the data to be used for the test. |
| test_statistic | string | ✔️ | The name of the column to store test statistic value for the results. |
| p_value | string | ✔️ | The name of the column to store p-value for the results. |

Function definition

You can define the function by either embedding its code as a query-defined function, or creating it as a stored function in your database, as follows:

Query-defined

Define the function using the following let statement. No permissions are required.

let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.wilcoxon(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
// Write your query to use the function here.

Stored

Define the stored function once using the following .create function. Database User permissions are required.

.create-or-alter function with (folder = "Packages\\Stats", docstring = "Wilcoxon Test")
wilcoxon_test_fl(tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.wilcoxon(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
}

Example

The following example uses the invoke operator to run the function.

Query-defined

To use a query-defined function, invoke it after the embedded function definition.

let wilcoxon_test_fl = (tbl:(*), data:string, test_statistic:string, p_value:string)
{
    let kwargs = bag_pack('data', data, 'test_statistic', test_statistic, 'p_value', p_value);
    let code = ```if 1:
        from scipy import stats
        data = kargs["data"]
        test_statistic = kargs["test_statistic"]
        p_value = kargs["p_value"]
        def func(row):
            statistics = stats.wilcoxon(row[data])
            return statistics[0], statistics[1]
        result = df
        result[[test_statistic, p_value]]  = df.apply(func, axis=1, result_type = "expand")
    ```;
    tbl
    | evaluate python(typeof(*), code, kwargs)
};
datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')

Stored

datatable(id:string, sample1:dynamic) [
'Test #1', dynamic([23.64, 20.57, 20.42]),
'Test #2', dynamic([20.85, 21.89, 23.41]),
'Test #3', dynamic([20.13, 20.5, 21.7, 22.02])
]
| extend test_stat= 0.0, p_val = 0.0
| invoke wilcoxon_test_fl('sample1', 'test_stat', 'p_val')

Output

| id | sample1 | test_stat | p_val |
|---|---|---|---|
| Test #1 | [23.64, 20.57, 20.42] | 0 | 0.10880943004054568 |
| Test #2 | [20.85, 21.89, 23.41] | 0 | 0.10880943004054568 |
| Test #3 | [20.13, 20.5, 21.7, 22.02] | 0 | 0.06788915486182899 |