nagios 設定檔

cgi.cfg – 使用者權限認證、網頁路徑等等
command.cfg – 定義執行名稱及其使用的指令與參數，下 check_http -h 可以得知其用法

define command{
command_name check_dns
command_line $USER1$/check_dns -H www.yahoo.com -s $HOSTADDRESS$
}

localhost.cfg – 本機設定檔，包含 timeperiods.cfg、contacts.cfg、
contactgroups.cfg、hosts.cfg、hostgroups.cfg、services.cfg
nagios.cfg – 主要設定檔

# Log 記錄
log_file=/var/log/nagios/nagios.log

# 監控方式可以將設定隔開於每個設定檔，比如說 host 要在一個檔、service 要在另一個檔
cfg_file=/etc/nagios/hosts.cfg
cfg_file=/etc/nagios/services.cfg

# 而 cfg_dir 的功能類似 apache 的 conf.d，該目錄下只要副檔名是 .cfg 的設定檔都會被載入
cfg_dir=/etc/nagios/hosts
cfg_dir=/etc/nagios/services

resource.cfg – 放一些使用者自訂的巨集 (例如 command 定義裡用到的 $USER1$)

timeperiods.cfg – 定義 service check 的時間，24x7 為全年無休囉

define timeperiod{
timeperiod_name 24x7                                 # 名稱定義
alias           24 Hours A Day, 7 Days A Week # 多加敘述
sunday          00:00-24:00                            # 一到日、00點到24點
monday          00:00-24:00
tuesday         00:00-24:00
wednesday       00:00-24:00
thursday        00:00-24:00
friday          00:00-24:00
saturday        00:00-24:00
}

contact – 主機有問題時該通知誰，及怎麼通知

define contact{
contact_name                    nagios-admin                       # 名稱定義
alias                           Nagios Admin                               # 名稱描述
contactgroups                                                                # 群組
service_notification_period     24x7                               # 服務有問題或復原時，這個時間內作警告通知，這時就用到 timeperiod_name 了
host_notification_period        24x7                                # 主機有問題或復原時，這個時間內作警告通知，這時就用到 timeperiod_name 了
service_notification_options    w,u,c,r,n                           # 針對服務什麼情況才通知，w=warning、u=unknown、c=critical、r=recovery、f=starts and stops flapping、n=none
host_notification_options       d,u,r,n                                  # 針對主機什麼情況才通知，d=down、u=unreachable、r=recovery、n=none
service_notification_commands   notify-by-email          # 針對服務用什麼方式通知
host_notification_commands      host-notify-by-email   # 針對主機用什麼方式通知
email                           cross@ssorc.tw                          # 這個 contact_name 的 email
pager
addressX
}

contactgroups

define contactgroup{
contactgroup_name       admins
alias                   Nagios Administrators
members                 nagios-admin            # contacts.cfg 的 contact_name 的名稱
}

host

define host {
host_name   ssorc.tw      # 主機名
alias              ssorc.tw      # 主機別名
address                              # 主機 ip 或 fqdn
parents                               # 上層主機名 ???
hostgroups
check_command
check_interval            # 檢查的時間間隔，間隔多少分鐘去作一次檢查
max_check_attempts   10   # 讓 check_command 去 check 十次
active_checks_enabled 1   # 要不要 check，1=enable、0=disable
passive_checks_enabled # ??? 被動檢查， enable 時會接受從外部進來的檢查結果 ???
check_period                 # 什麼時間監控 check，設定 timeperiod 的 name
contact_groups             #
notification_interval       # 定義狀態仍是 down 或者 unreachable 時，隔多久再通知一次，分鐘
notification_period         # 警告通知的時間，設定 timeperiod 的 name
notification_options      # 決定什麼情況要警告通知，d=down、u=unreachable、r=recovery、f=host start and stops flapping、n=none
notifications_enabled     # 是否要警告通知，1=enable、0=disable
obsess_over_host 1       # 1=enable 時可來執行 ochp_command，檢查、通知完後才執行 ?????
check_freshness [0/1]    # 對於資訊新鮮度的檢查，如果被動監視被啟動時，這個選項可避免資訊過期 ?????
freshness_threshold   # 當新鮮度被啟動時，這個用來決定資訊的過期時間，在那個值裡的才算是新鮮的 ????????
event_handler # 當事件發生時需要被啟動的命令
event_handler_enabled [0/1] # 對應 event_handler，1=enable
low_flap_threshold   # 判斷 flap 的低門檻值，設為 0 或沒有設定時即使用 nagios.cfg 的全域預設值
high_flap_threshold   # 高門檻值
flap_detection_enabled [0/1]   # 是否要 flap 偵測
process_perf_data  [0/1] # nagios.cfg 裡有個 host_perfdata_command 項目，這個命令會在每個 host 檢查完後執行。預設為0=disable，1=enable
retain_status_information [0/1]   # 在 nagios.cfg 裡也有這個設定，設 1 時這裡才有效用，用於 nagios 重啟時仍保留狀態資訊
retain_nonstatus_information [0/1]    # 保留非狀態資訊
stalking_options # 要以什麼樣的方式來記錄 log，通常 log 只有在狀態發生改變時，才會 log，這個項目用來記錄不同情況，o=stalk on UP states、d=stalk on DOWN states、u=stalk on UNREACHABLE states
}

hostgroup

define hostgroup {
hostgroup_name
alias
members
}

service

define service {
host_name   # 被監控此服務的主機名
service_description   # 描述
servicegroups
is_volatile         # ???
check_command
max_check_attempts   # 最大檢查次數，在狀態為不OK時才使用到
normal_check_interval   # 一般情況下這次檢查與下次檢查隔多久時間(分鐘)，給 s 的話就是秒了，值為 0 時即是不檢查，值為 -1 時就是盡量不斷的檢查
retry_check_interval          # 狀態呈現不ok時，才使用這個時間週期(不使用normal_check_interval)，次數是靠著 max_check_attempts
# 比方說: 第一次檢查連線不到，狀態為 soft，使用 retry_check_interval 時間重試，試了 max_check_attempts 次數後，狀態轉變為 hard
# 而轉為 hard 狀態之後，檢查的間隔時間會回到 normal_check_interval
check_period               # 什麼時間監控 check，設定 timeperiod 的 name
active_checks_enabled
parallelize_check      # ???
notification_interval   # hard 狀態時發出通知的時間週期，也就是第一次通知與第二次通知的時間隔。值是 0 時，只通知一次
notification_period   # 意義同 check_period
notification_options # 什麼情況才要通知，w=warning、u=unknown、c=critical、r=recovery、f=starts and stops flapping、n=none
notifications_enabled
contact_groups   # 通知誰
}

servicegroup

define servicegroup {
servicegroup_name
alias
members
}

servicedependency
serviceescalation
hostdependency
hostescalation

hostextinfo
serviceextinfo

Related posts 相關文章

使用 Grafana 與 Prometheus 監控主機

簡單容易自己架設的監控平台-Uptime Kuma

監控系統 icinga (nagios 的分支) 安裝 icinga-web 時遇到 500 internal server error

監控系統 icinga (nagios 的分支) Q&A 篇

作者

留言

撰寫回覆或留言取消回覆