Skip to end of metadata
Go to start of metadata

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

Version 1 Current »

Follow these steps to configure advanced monitoring & analytics:

  1. Make sure that the agent/collector has the correct configuration.

  2. Update your custom_values.yaml file to include the following, and then rerun the install command.

    # Ingester overrides for custom_values.yaml.
    ingester:
      replicaCount: 1
      config:
        aggregator:
          metric:
            # NOTE(review): presumably turns on the knight metric aggregation that
            # the `source: knight` detectors under query-service rely on — confirm.
            knightEnabled: true
    
    # Query-service overrides for custom_values.yaml; the advancefunctions
    # section carries the anomaly-detection configuration.
    query-service:
      replicaCount: 1
      advancefunctions:
        replicaCount: 1
        config:
          # NOTE(review): presumably creates alerts automatically for the
          # detectors listed under `algos` — confirm.
          auto_alert: true
          # One list entry per anomaly detector.
          algos:
            # rrcf detector over the 50th-percentile HTTP request size tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '50th percentile http request sizes per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,request_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_request_size_p50
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the 90th-percentile HTTP request size tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '90th percentile http request sizes per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,request_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_request_size_p90
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the 50th-percentile HTTP response size tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '50th percentile http response sizes per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,response_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_response_size_p50
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the 90th-percentile HTTP response size tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '90th percentile http response sizes per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,response_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_response_size_p90
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the 50th-percentile HTTP request latency tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '50th percentile http request latencies per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.50,latency_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_latency_p50
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the 90th-percentile HTTP request latency tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              description: '90th percentile http request latencies per service, per request path and type tracked by knight'
              # The namespace matcher must use the label name kube_namespace (the
              # same label the by() clause groups on). A negative matcher on a
              # misspelled, non-existent label matches every series, so the kfuse
              # namespaces would not be excluded.
              inner: 'avg by(kube_cluster_name, kube_namespace, kube_service, request_path, request_type) (histogram_quantile(.90,latency_count{kube_namespace!~"kfuse.*",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_latency_p90
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
            # rrcf detector over the HTTP error rate tracked by knight.
            - name: rrcf
              num_trees: 10
              tree_size: 256
              # More than 70% of the samples must have valid values.
              min_samples_for_score: 0.7
              # NOTE(review): the original note read "25% of the samples to be
              # present to calculate anomaly", which does not match the value 1 —
              # confirm the intended value/unit.
              min_samples_for_anomaly_factor: 1
              # How much higher the current score has to be than the percentile
              # value for it to be considered anomalous.
              score_value_tolerance: 1.1
              # Score percentile to compare the current score with: if
              # current_score > score_percentile_value of all sample scores (being
              # > min_samples_for_anomaly_factor), then the score is anomalous.
              score_to_anomaly_percentile: 0.99
              # When a value is considered anomalous, its value is normalized using
              # this function. 0: no normalization, 1: mean, 2: interpolate
              on_anomaly_value_normalization_fn: 1
              # 0: quantile based, 1: 3 sigma based over tree size history,
              # 2: 3 sigma based over full history
              anomaly_fn: 1
              # When detecting anomalies using anomaly_fn, whether anomalous scores
              # are considered or ignored.
              ignore_anomalous_scores: 0
              # NOTE(review): the description says "per request type", but the
              # query below groups by request_path and not request_type — confirm
              # which one is intended.
              description: 'error rate of http requests per service per request type tracked by knight'
              # The matchers must use the real label names kube_namespace and
              # response_code. A negative matcher on a misspelled, non-existent
              # label matches every series, so neither the kfuse namespaces nor the
              # 2xx responses would be excluded from the "error" rate.
              inner: 'sum by(kube_cluster_name, kube_namespace, kube_service, request_path)(rate(error{kube_namespace!~"kfuse.*",response_code!~"2..",protocol="http",kube_service!~"datadog-agent.*|kfuse-agent.*",kube_service=~".+",request_path=~".+",request_type=~".+"}[5m]))'
              frequency: 60
              samples: 15
              enabled: true
              source: knight
              metric_name: knight_error
              # NOTE(review): entries such as "/js/.*.js" look like regular
              # expressions; if so, the unescaped `?` and `.` are metacharacters.
              # Whether these paths are included or excluded is not shown here —
              # confirm.
              filters:
                request_path:
                  - "/merge?"
                  - "/detail.html?"
                  - "/stats/summary?"
                  - "/js/.*.js"
                  - "/favicon.png"
                  - "/category.html"
                  - "/robots.txt"
    
  • No labels