# openobserve-values.yaml
# Helm values for a production OpenObserve deployment.
image:
  repository: public.ecr.aws/zinclabs/openobserve
  tag: "0.15.1"
  pullPolicy: IfNotPresent

# 3 nodes recommended for HA in production
replicaCount: 3

# Resource configuration (adjust based on log volume)
resources:
  limits:
    cpu: "8"
    memory: 16Gi
  requests:
    cpu: "4"
    memory: 8Gi

# Data persistence
persistence:
  enabled: true
  existingClaim: openobserve-data
  mountPath: /data

# Environment variables
env:
  - name: ZO_ROOT_USER_EMAIL
    value: "admin@example.com"
  # Root password is sourced from a Secret, never stored in values
  - name: ZO_ROOT_USER_PASSWORD
    valueFrom:
      secretKeyRef:
        name: openobserve-secret
        key: password
  - name: ZO_DATA_DIR
    value: "/data"
  - name: ZO_HTTP_PORT
    value: "5080"
  - name: ZO_MEMORY_CACHE_ENABLED
    value: "true"
  # MB
  - name: ZO_MEMORY_CACHE_MAX_SIZE
    value: "4096"
  - name: ZO_COMPRESSION_ENABLED
    value: "true"
  - name: ZO_COMPRESSION_FORMAT
    value: "zstd"
  - name: ZO_PARQUET_COMPRESSION
    value: "zstd"
  # Use PostgreSQL for production
  - name: ZO_META_STORE
    value: "sqlite"
  - name: ZO_METRICS_ENABLED
    value: "true"

# Configure PostgreSQL metadata store (recommended for production)
envFrom:
  - secretRef:
      name: postgres-connection

# Service configuration
service:
  type: ClusterIP
  port: 5080
  targetPort: 5080
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: "nlb"

# Ingress configuration
ingress:
  enabled: true
  className: "nginx"
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    nginx.ingress.kubernetes.io/proxy-body-size: "100m"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
  hosts:
    - host: logs.example.com
      paths:
        - path: /
          pathType: Prefix
  tls:
    - secretName: logs-tls
      hosts:
        - logs.example.com

# Pod scheduling
nodeSelector: {}
tolerations: []
# Prefer spreading replicas across nodes
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 100
        podAffinityTerm:
          labelSelector:
            matchExpressions:
              - key: app.kubernetes.io/name
                operator: In
                values:
                  - openobserve
          topologyKey: kubernetes.io/hostname

# Monitoring configuration
monitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s
    namespace: logging
kind: ConfigMap
apiVersion: v1
metadata:
  name: fluent-bit-config
  namespace: uganda-prod
  labels:
    app: fluent-bit
    tier: logging
data:
  # ---------------------------------------------------------------------------
  # 1. Custom Parsers
  # Used to extract specific log fields, e.g., extract the log level
  # (INFO, ERROR, etc.) from Java logs
  # ---------------------------------------------------------------------------
  custom_parsers.conf: |
    [PARSER]
        Name java_log_level
        Format regex
        # Match format: 2023-10-27 10:00:00.123 INFO ...
        # The 'level' capture group will contain the log level
        Regex ^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}\s+(?<level>[A-Z]+)
  # ---------------------------------------------------------------------------
  # 2. Multiline Parsers
  # Handles multi-line logs such as Java stack traces so a stack is not
  # split into multiple records
  # ---------------------------------------------------------------------------
  multiline_parsers.conf: |
    [MULTILINE_PARSER]
        Name java_md_multiline
        Type regex
        # Rule 1: a line starting with a date (e.g., 2023-...) begins a new log
        rule "start_state" "/^\d{4}-\d{2}-\d{2}/" "cont"
        # Rule 2: a line NOT starting with a date continues the previous record
        rule "cont" "/^(?!\d{4}-\d{2}-\d{2}).+/" "cont"
        # Flush timeout (seconds): force output of the buffered record on timeout
        flush_timeout 5
  # ---------------------------------------------------------------------------
  # 3. Main Configuration File
  # ---------------------------------------------------------------------------
  fluent-bit.conf: |
    [SERVICE]
Daemon Off
Flush 1
Log_Level info
# Load standard and custom parsers
Parsers_File /fluent-bit/etc/conf/parsers.conf
Parsers_File /fluent-bit/etc/conf/custom_parsers.conf
Parsers_File /fluent-bit/etc/conf/multiline_parsers.conf
# Enable built-in HTTP server for health checks and metrics exposure (/api/v1/metrics)
HTTP_Server On
HTTP_Listen 0.0.0.0
HTTP_Port 2020
Health_Check On
# File system buffer configuration (prevent data loss when backend is unavailable)
# NOTE(review): storage.path must exist and be writable on the node —
# presumably a hostPath mount in the DaemonSet; confirm against the manifest.
storage.path /var/log/flb_storage
storage.sync normal
storage.checksum off
storage.backlog.mem_limit 200M
# -----------------------------------------------------------------------
# [INPUT] Tail file collection
# Collect standard container logs generated by Docker/Containerd
# -----------------------------------------------------------------------
[INPUT]
Name tail
Path /var/log/containers/*.log
# Enable built-in multiline parsing (docker, cri format)
multiline.parser docker, cri
Tag kube.*
Mem_Buf_Limit 500MB
Skip_Long_Lines On
Refresh_Interval 10
# Persist offset database, resume reading from checkpoint after restart
# (FIX: this line was missing its '#' and would have been rejected as a
# malformed key/value entry by Fluent Bit)
DB /var/log/flb_kube.db
DB.Sync Normal
Rotate_Wait 30
Read_from_Head Off
# -----------------------------------------------------------------------
# [FILTER] Stage 1: Multiline merging
# Merge multiline logs before getting K8s metadata to ensure stack traces are handled as single records
# -----------------------------------------------------------------------
[FILTER]
Name multiline
Match kube.*
# Field that holds the raw log line produced by the tail input
multiline.key_content log
# Custom parser defined in multiline_parsers.conf
multiline.parser java_md_multiline
# -----------------------------------------------------------------------
# [FILTER] Stage 2: K8s metadata enhancement
# Call K8s API to get Pod details (Namespace, Pod Name, Labels, etc.)
# (FIX: the line above was missing its '#' comment prefix)
# -----------------------------------------------------------------------
[FILTER]
Name kubernetes
Match kube.*
# Don't try to parse the log field as JSON (avoid performance overhead or errors)
Merge_Log Off
# Keep original log field
Keep_Log On
# Allow Pod Annotation to specify parser
K8S-Logging.Parser On
# Don't exclude logs with the exclude annotation
K8S-Logging.Exclude Off
# NOTE: comments were moved onto their own lines — Fluent Bit's classic
# config format does not support inline comments; text after the value
# would have been read as part of the value (e.g. "Off # Don't ...").
# -----------------------------------------------------------------------
# [FILTER] Stage 3: Log content parsing
# Use custom parser to extract 'level' field
# -----------------------------------------------------------------------
[FILTER]
Name parser
Match kube.*
Key_Name log
Parser java_log_level
# Keep original data (all other fields of the record)
Reserve_Data On
# Keep original log key alongside the parsed fields
Preserve_Key On
# NOTE: comments were moved onto their own lines — Fluent Bit's classic
# config format does not support inline comments, so "On # Keep ..." would
# have been read as the literal value.
# -----------------------------------------------------------------------
# [FILTER] Stage 4: Field cleaning
# Copy extracted level field, remove unnecessary temporary fields
# -----------------------------------------------------------------------
[FILTER]
Name modify
Match kube.*
# NOTE(review): 'Copy level level' looks like a no-op — the modify filter's
# Copy rule only fires when the target key does not already exist. A rename
# (e.g. to 'log_level') was probably intended; confirm with the author.
Copy level level
Remove _p
Remove stream
Remove time
# -----------------------------------------------------------------------
# [FILTER] Stage 5: Structure restructuring (Nest Lift)
# Lift fields under kubernetes object to top level with 'k8s_' prefix
# Purpose: Flatten data structure for easier rule matching
# -----------------------------------------------------------------------
[FILTER]
Name nest
Match kube.*
Operation lift
Nested_under kubernetes
Add_prefix k8s_
# -----------------------------------------------------------------------
# [FILTER] Stage 6: Remove redundant metadata
# Delete large fields not needed to be sent to backend (e.g., annotations, docker_id)
# -----------------------------------------------------------------------
[FILTER]
Name modify
Match kube.*
Remove k8s_annotations
Remove k8s_docker_id
Remove k8s_container_hash
# -----------------------------------------------------------------------
# [FILTER] Stage 7: Structure restructuring (Nest Nest)
# Repackage all 'k8s_' prefixed fields back under 'kubernetes' object, removing prefix
# Purpose: Restore clean nested structure while cleaning up useless fields
# (Stages 5-7 form a lift -> prune -> re-nest pipeline; the prune in Stage 6
# only works on the flattened 'k8s_*' keys, which is why the lift is needed.)
# -----------------------------------------------------------------------
[FILTER]
Name nest
Match kube.*
Operation nest
Wildcard k8s_*
Nested_under kubernetes
Remove_prefix k8s_
# =======================================================================
# [FILTER] Stage 8: Dynamic routing (Rewrite Tag) - First hop: Split by environment
# Modify Tag based on Namespace name to route logs to different processing streams
# Syntax: Rule $field_name regex newTag keep_original(boolean)
# keep_original=false: the original kube.* record is dropped after re-emit
# =======================================================================
# Route: uganda-uat environment
[FILTER]
Name rewrite_tag
Match kube.*
Rule $kubernetes['namespace_name'] ^uganda-uat$ uganda-uat.temp false
Emitter_Name re_emitted_uganda-uat-temp
# Route: uganda-test environment
[FILTER]
Name rewrite_tag
Match kube.*
Rule $kubernetes['namespace_name'] ^uganda-test$ uganda-test.temp false
Emitter_Name re_emitted_uganda-test-temp
# Route: uganda-prod environment (current deployment namespace)
[FILTER]
Name rewrite_tag
Match kube.*
Rule $kubernetes['namespace_name'] ^uganda-prod$ uganda-prod.temp false
Emitter_Name re_emitted_uganda-prod-temp
# Route: uganda-offline environment
[FILTER]
Name rewrite_tag
Match kube.*
Rule $kubernetes['namespace_name'] ^uganda-offline$ uganda-offline.temp false
Emitter_Name re_emitted_uganda-offline-temp
# =======================================================================
# [FILTER] Stage 9: Dynamic routing (Rewrite Tag) - Second hop: Split by service
# Secondary routing for specific containers in specific environments for fine-grained index isolation
# NOTE(review): *.temp records whose container does not match any rule below
# have no matching [OUTPUT] and are silently dropped — see the commented
# default output at the end of this file.
# =======================================================================
# Example: UAT environment -> lms-backend service
[FILTER]
Name rewrite_tag
Match uganda-uat.temp
Rule $kubernetes['container_name'] ^lms-backend$ uganda-uat-lms-backend false
Emitter_Name re_emitted_uganda-uat-lms-backend
# Example: UAT environment -> other-service service (needs complete configuration)
[FILTER]
Name rewrite_tag
Match uganda-uat.temp
Rule $kubernetes['container_name'] ^other-service$ uganda-uat-other-service false
Emitter_Name re_emitted_uganda-uat-other-service
# Note: Production and other environments need similar container name filtering rules added
# For example:
# [FILTER]
# Name rewrite_tag
# Match uganda-prod.temp
# Rule $kubernetes['container_name'] ^payment-service$ uganda-prod-payment-service false
# Emitter_Name re_emitted_uganda-prod-payment-service
# =======================================================================
# [OUTPUT] Output plugin configuration
# Send filtered logs to OpenObserve
# =======================================================================
# Output: UAT LMS Backend
[OUTPUT]
Name http
Match uganda-uat-lms-backend
URI /api/39NVPcXSEBOwGM5UnceQ35hQFNB/lms_backend/_json
Host openobserve.uganda-uat.svc.cluster.local
Port 5080
tls Off
Format json
Json_date_key _timestamp
Json_date_format iso8601
# SECURITY: credentials are committed in plain text here. Move them into a
# Kubernetes Secret and inject them as environment variables (Fluent Bit
# supports ${ENV_VAR} substitution in config values).
HTTP_User ops@test.com
HTTP_Passwd ROPe50N4BJjovJiT
compress gzip
# Infinite retry until successful
# (FIX: this note was previously an inline comment after the value; classic
# Fluent Bit config has no inline comments, so it corrupted the setting.)
Retry_Limit False
net.connect_timeout 10
net.io_timeout 30
# Output: UAT Other Service (example)
[OUTPUT]
Name http
Match uganda-uat-other-service
URI /api/39NVPcXSEBOwGM5UnceQ35hQFNB/other_service/_json
Host openobserve.uganda-uat.svc.cluster.local
Port 5080
tls Off
Format json
Json_date_key _timestamp
Json_date_format iso8601
# SECURITY: plain-text credentials committed to the ConfigMap — move them
# into a Kubernetes Secret and reference them via environment variables.
HTTP_User devops@test.com
HTTP_Passwd ROPe50N4BJjovJiT
compress gzip
Retry_Limit False
# Default output (optional): Capture logs that don't match any specific rules to prevent data loss
# [OUTPUT]
# Name http
# Match uganda-prod.temp
# URI /api/.../default/_json
# ...
## Configuration Explanation

- **Filter stage**: the `rewrite_tag` filter routes logs by namespace and container name
- **Output stage**: filtered logs are sent over HTTP to the OpenObserve instance for each environment
- **Multiline logs**: the `multiline` filter merges Java exception stack traces into single records
- **Transport tuning**: gzip compression, retry on failure, and TCP connect/IO timeout configuration
### Multiline Merge Rule Explanation

| State | Matching Rule | Description |
|-------|---------------|-------------|
| `start_state` | `/^\d{4}-\d{2}-\d{2}/` | Line starts with a date → a new log record begins; transition to `cont` |
| `cont` | `/^(?!\d{4}-\d{2}-\d{2}).+/` | Line does not start with a date → append to the previous record |
Note: After modifying the ConfigMap, the fluent-bit service needs to be restarted.
# Method 1: Create via API
curl -X POST "https://logs.example.com/api/demo/streams" \
  -u "admin@example.com:password" \
  -H "Content-Type: application/json" \
  -d '{
    "name": "app-logs",
    "storage_type": "memory",
    "stream_type": "logs"
  }'

# Method 2: Auto-create (first write auto-creates, we use this method)
# Fluent Bit will auto-create a new stream on first write
Set Data Retention Policy
OpenObserve’s dashboard allows setting retention policies based on index, with main strategies as follows:
Time-based retention
Size-based retention
Hybrid strategy (3650 days or 30TB, whichever comes first, we use this approach. Since it’s a financial service, we set retention for 10 years)
# OpenObserve auto-compression
env:
  - name: ZO_COMPRESSION_ENABLED
    value: "true"
  # zstd/gzip/snappy
  - name: ZO_COMPRESSION_FORMAT
    value: "zstd"
  # 1-19: higher values give a better compression ratio at higher CPU cost
  - name: ZO_COMPRESSION_LEVEL
    value: "3"
Query and Analysis
OpenObserve provides a powerful query engine supporting SQL mode and native query language (VQL). The following introduces basic queries and aggregation queries in different scenarios to help you quickly locate key logs.
💡 Core Tips
Mode Switch: Aggregation queries are recommended to be executed in SQL mode.
Table Name Convention: Stream names (table names) must be enclosed in double quotes, for example "lms".
Time Range: All queries are limited by the time selector in the upper right corner of the UI, please ensure the time range covers the target data.
1. Basic Search
Quickly locate a single log or specific event.
1.1 Full-text Index Search
Automatically scans all text fields, suitable for fuzzy searching.
Scenario: Calculate average and maximum latency for specific API requests to evaluate performance bottlenecks.
3. Advanced Techniques and Best Practices
| Technique | Description | Example |
|-----------|-------------|---------|
| Limit result set | Avoid returning too much data and causing browser lag | Add `LIMIT 100` |
| Deduplication count | Count unique error message types | `COUNT(DISTINCT log)` |
| Alias optimization | Make output column names more readable | `COUNT(*) AS "Total Errors"` |
| Null value handling | Exclude records with empty fields | `WHERE level IS NOT NULL` |
| Regex matching | More flexible matching than `str_match` | `REGEXP_MATCH(log, 'Error.*\d+')` |
⚠️ Performance Recommendations
Filter before aggregating: Be sure to narrow down the data range as much as possible in the WHERE clause (specifying time, keywords) before performing GROUP BY, which can significantly improve query speed.
Time granularity: When querying over large time ranges (like 7 days), use date_trunc('hour', ...) or date_trunc('day', ...), avoid using 'second' which causes too many data points.
Field indexing: For fields commonly used in GROUP BY (like level, container_name), it’s recommended to enable indexing in Stream Settings for best performance.
There are two parts that need monitoring: one is the OpenObserve service monitoring itself, and the other is business log monitoring. For monitoring of OpenObserve and Fluent Bit services themselves, Prometheus monitoring can be used directly. For core business order log monitoring, OpenObserve’s built-in alerting mechanism can meet the requirements directly.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: logging-alerts
  namespace: logging
spec:
  groups:
    - name: logging
      rules:
        # Fluent Bit log loss alert
        - alert: FluentBitLogsDropping
          expr: rate(fluentbit_output_proc_records_failed_total[5m]) > 100
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Fluent Bit Log Loss"
            description: "Lost {{ $value }} logs in the last 5 minutes"
        # OpenObserve storage alert
        - alert: OpenObserveDiskSpaceLow
          expr: (node_filesystem_avail_bytes{mountpoint="/data"} / node_filesystem_size_bytes{mountpoint="/data"}) < 0.1
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "OpenObserve Disk Space Low"
            description: "Less than 10% disk space remaining"
        # Log ingestion delay
        # NOTE(review): confirm the metric name 'openobserve_ingest_delay_seconds'
        # exists in your OpenObserve version before relying on this alert.
        - alert: LoggingIngestLag
          expr: openobserve_ingest_delay_seconds > 300
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Log Ingestion Lag"
            description: "Log lag is {{ $value }} seconds"
Business Log Alerts
This involves several key parts:
Alert Message Template
Settings –> Templates –> Add template, define Webhook alert message format, note that the message needs to be configured differently for different chat tools
Note: The message here needs to be configured differently for different chat tools. My configuration here is for WeChat Work. For more information, please refer to https://openobserve.ai/docs/user-guide/management/templates/ to configure a suitable alert template for your needs.
Alert Address
Add a message assistant to the WeChat Work group to get the push message Webhook address, then add it in OpenObserve UI Settings –> Address –> Add Address.
Alert Rule Configuration
This needs to be determined based on internal R&D team communication, business log matching rules. Daily alerts should only match ERROR level logs. For core orders and risk control processes, phone alerts are recommended:
OpenObserve Open Source Edition Risks and Considerations
RBAC Permission Control
Open Source Edition Limitations:
No granular RBAC control (Enterprise edition supports)
# 1. Use multi-tenancy (Stream-level isolation)
# Create separate Streams for each team
curl -X POST "https://logs.example.com/api/demo/streams" \
  -d '{"name":"team-a-logs"}'
curl -X POST "https://logs.example.com/api/demo/streams" \
  -d '{"name":"team-b-logs"}'

# 2. Use reverse proxy for permission control
# Nginx routes users to different Streams based on user:
#
#   location /api/team-a/ {
#       internal;
#       proxy_pass http://openobserve.logging.svc/api/team-a-logs/;
#   }

# 3. Use API Key for simple authentication
# Generate a separate API Key for each team
Here, since there are many projects, permission control rules are usually added at the reverse proxy, such as restricting IP and Host for POST requests.
# 1. Deploy multiple replicas
replicaCount: 3

# 2. Use shared storage
persistence:
  enabled: true
  # Use distributed storage
  # NOTE(review): the storageClass name says NFS but the original comment
  # says Ceph RBD — confirm which backend is actually in use.
  storageClass: "nfs-client"

# 3. Configure pod anti-affinity (hard requirement: one replica per node)
affinity:
  podAntiAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchExpressions:
            - key: app.kubernetes.io/name
              operator: In
              values:
                - openobserve
        topologyKey: kubernetes.io/hostname
Capacity Planning

| Log Volume | Fluent Bit Resources | OpenObserve Resources | Storage Space/Month |
|------------|----------------------|-----------------------|---------------------|
| 10GB/day | 100m CPU / 100Mi | 2 CPU / 4Gi | 100GB |
| 100GB/day | 200m CPU / 200Mi | 4 CPU / 8Gi | 1TB |
| 1TB/day | 500m CPU / 500Mi | 8 CPU / 16Gi | 10TB |
Summary
The log platform solution based on OpenObserve and Fluent Bit has the following advantages:
Cost Advantage: Compared to ELK solutions, storage costs are reduced by 90% and computing resources by 70%
High Performance: A single node supports 5TB+/day log ingestion with query responses <100ms
Simple and Easy to Use: Deployment time <30 minutes with low learning curve
Cloud Native: Kubernetes native integration with automatic scaling
It has been running stably in production for half a year with 5TB+ data volume, and query responses are stable within 200ms, basically meeting most business needs. If there are strong requirements for RBAC, it is recommended to directly purchase the commercial version or abandon this solution.