Configure Advanced Monitoring & Analytics

Follow these steps to configure advanced monitoring & analytics:

  1. Make sure that the agent/collector has the correct configuration.

  2. Update your custom_values.yaml file to include the following, and then rerun the install command.

    # Values to merge into custom_values.yaml.
    # NOTE(review): the nesting below was reconstructed from a flattened
    # snippet; confirm it against the chart's values schema before use.
    ingester:
      replicaCount: 1
      config:
        aggregator:
          metric:
            knightEnabled: true

    query-service:
      replicaCount: 1

    advancefunctions:
      replicaCount: 1
      config:
        auto_alert: true
        # Every entry in `algos` repeats the same tuning parameters with the
        # same values. They are described once here:
        #
        #   min_samples_for_score
        #     More than 70% of the samples have to have valid values.
        #   min_samples_for_anomaly_factor
        #     Share of the samples that must be present to calculate an
        #     anomaly. NOTE(review): the original note says "25%" while the
        #     configured value is 1; confirm which one is intended.
        #   score_value_tolerance
        #     How much higher the current score has to be than the percentile
        #     value for it to be considered anomalous.
        #   score_to_anomaly_percentile
        #     Score percentile value to compare the current score with. If
        #     current_score > score_percentile_value of all sample scores
        #     (being > min_samples_for_anomaly_factor), the score is anomalous.
        #   on_anomaly_value_normalization_fn
        #     When a value is considered anomalous, its value is normalized
        #     using this function. 0: no normalization, 1: mean,
        #     2: interpolate.
        #   anomaly_fn
        #     0: quantile based, 1: 3 sigma based over tree size history,
        #     2: 3 sigma based over full history.
        #   ignore_anomalous_scores
        #     When detecting anomalies using anomaly_fn, whether anomalous
        #     scores should be considered or ignored.
        algos:
          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '50th percentile http request sizes per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,request_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_request_size_p50
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '90th percentile http request sizes per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,request_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_request_size_p90
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '50th percentile http response sizes per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,response_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_response_size_p50
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '90th percentile http response sizes per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,response_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_response_size_p90
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '50th percentile http request latencies per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,latency_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_latency_p50
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: '90th percentile http request latencies per service, per request path and type tracked by knight'
            inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,latency_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_latency_p90
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"

          - name: rrcf
            num_trees: 10
            tree_size: 256
            min_samples_for_score: 0.7
            min_samples_for_anomaly_factor: 1
            score_value_tolerance: 1.1
            score_to_anomaly_percentile: 0.99
            on_anomaly_value_normalization_fn: 1
            anomaly_fn: 1
            ignore_anomalous_scores: 0
            description: 'error rate of http requests per service per request type tracked by knight'
            # Only non-2xx responses are counted (response_code!~"2..").
            inner: 'sum by(kube_cluster_name, kube_namespace, kube_service, request_path)(rate(error{kube_namespace!~"kfuse.*",response_code!~"2..",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}[5m]))'
            frequency: 60
            samples: 15
            enabled: true
            source: knight
            metric_name: knight_error
            filters:
              request_path:
                - "/merge?"
                - "/detail.html?"
                - "/stats/summary?"
                - "/js/.*.js"
                - "/favicon.png"
                - "/category.html"
                - "/robots.txt"