作者:youngl9
项目:bosu
// save serializes the schedule's in-memory state (search indexes,
// notifications, silences, status, metadata, and incidents) and persists
// it to the bolt database. It is a no-op when no database is configured.
// After a successful write it reports the state file's on-disk size.
func (s *Schedule) save() {
	if s.db == nil {
		return
	}
	tostore, ok := s.encodeState()
	if !ok {
		// An encode error was already logged; nothing safe to persist.
		return
	}
	err := s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(dbBucket))
		if err != nil {
			return err
		}
		for name, data := range tostore {
			if err := b.Put([]byte(name), data); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		slog.Errorf("save db update error: %v", err)
		return
	}
	// Best-effort metric: total size of the state file on disk.
	fi, err := os.Stat(s.Conf.StateFile)
	if err == nil {
		collect.Put("statefile.size", opentsdb.TagSet{"object": "total"}, fi.Size())
	}
	slog.Infoln("save to db complete")
}

// encodeState gob-encodes and gzips each state object while holding the
// schedule lock, returning the encoded blobs keyed by object name. It
// reports false if any object fails to encode. Isolating the locked phase
// in a helper lets the lock be released with defer, so a panic during
// encoding cannot leave the schedule permanently locked (the original
// code unlocked manually on two separate paths).
func (s *Schedule) encodeState() (map[string][]byte, bool) {
	s.Lock("Save")
	defer s.Unlock()
	store := map[string]interface{}{
		dbMetric:        s.Search.Read.Metric,
		dbTagk:          s.Search.Read.Tagk,
		dbTagv:          s.Search.Read.Tagv,
		dbMetricTags:    s.Search.Read.MetricTags,
		dbNotifications: s.Notifications,
		dbSilence:       s.Silence,
		dbStatus:        s.status,
		dbMetadata:      s.Metadata,
		dbIncidents:     s.Incidents,
	}
	tostore := make(map[string][]byte, len(store))
	for name, data := range store {
		f := new(bytes.Buffer)
		gz := gzip.NewWriter(f)
		// counterWriter tracks the compressed byte count for metrics.
		cw := &counterWriter{w: gz}
		if err := gob.NewEncoder(cw).Encode(data); err != nil {
			slog.Errorf("error saving %s: %v", name, err)
			return nil, false
		}
		if err := gz.Flush(); err != nil {
			slog.Errorf("gzip flush error saving %s: %v", name, err)
		}
		if err := gz.Close(); err != nil {
			slog.Errorf("gzip close error saving %s: %v", name, err)
		}
		tostore[name] = f.Bytes()
		slog.Infof("wrote %s: %v", name, conf.ByteSize(cw.written))
		collect.Put("statefile.size", opentsdb.TagSet{"object": name}, cw.written)
	}
	return tostore, true
}
作者:Skyscanne
项目:bosu
// CheckAlert evaluates one alert's dependency, critical, and warning
// expressions against the run history r, recording resulting events and
// a check-duration metric.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	// Pre-mark alert keys that have gone unknown for this alert.
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var (
		crits, warns models.AlertKeys
		deps         expr.ResultSlice
	)
	depResults, err := s.executeExpr(T, r, a, a.Depends)
	if err == nil {
		deps = filterDependencyResults(depResults)
		// Crit is checked first; keys already critical suppress warn.
		if crits, err = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil); err == nil {
			warns, err = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err == nil {
		s.markAlertSuccessful(a.Name)
	} else {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		// A failed check should not leave stale unknown events behind.
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
作者:nicolle
项目:bosu
// CheckAlert evaluates one alert's dependency, critical, and warning
// expressions against the run history r, recording resulting events.
// It returns true if the check was cancelled because the schedule's
// runner context closed before evaluation finished.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start", a.Name)
	start := utcNow()
	// Pre-mark alert keys that have gone unknown for this alert.
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	// res bundles the dependency-expression result so it can be passed
	// through a single channel.
	type res struct {
		results *expr.Results
		error   error
	}
	// buffered channel so go func that runs executeExpr won't leak if the Check is cancelled
	// by the closing of the schedule
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		// NOTE: d and err here intentionally shadow the outer variables;
		// the result is handed back through rc, not through assignment.
		d, err := s.executeExpr(T, r, a, a.Depends)
		rc <- res{d, err} // this will hang forever if the channel isn't buffered since nothing will ever receive from rc
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we abandon the
	// execution of the expression
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		// Crit is checked first; keys already critical suppress warn.
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	// On cancellation, abandon the check without recording metrics or state.
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		// A failed check should not leave stale unknown events behind.
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
作者:bridgewel
项目:bosu
func pingHost(host string) {
p := fastping.NewPinger()
tags := opentsdb.TagSet{"dst_host": host}
resolved := 0
defer func() {
collect.Put("ping.resolved", tags, resolved)
}()
ra, err := net.ResolveIPAddr("ip4:icmp", host)
if err != nil {
return
}
resolved = 1
p.AddIPAddr(ra)
p.MaxRTT = time.Second * 5
timeout := 1
p.OnRecv = func(addr *net.IPAddr, t time.Duration) {
collect.Put("ping.rtt", tags, float64(t)/float64(time.Millisecond))
timeout = 0
}
if err := p.Run(); err != nil {
log.Print(err)
}
collect.Put("ping.timeout", tags, timeout)
}
作者:wenxiaoy
项目:bosu
// CheckAlert evaluates one alert's dependency, critical, and warning
// expressions against the run history r, recording resulting events and
// a check-duration metric.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) {
	log.Printf("check alert %v start", a.Name)
	start := time.Now()
	var warns, crits expr.AlertKeys
	d, err := s.executeExpr(T, r, a, a.Depends)
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		// Crit is checked first; keys already critical suppress warn.
		crits, err = s.CheckExpr(T, r, a, a.Crit, StCritical, nil)
		if err == nil {
			// Propagate warn-check errors (previously discarded with `_`),
			// so a failed warn evaluation also clears unknown events below,
			// matching the other CheckAlert implementations.
			warns, err = s.CheckExpr(T, r, a, a.Warn, StWarning, crits)
		}
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		// A failed check should not leave stale unknown events behind.
		removeUnknownEvents(r.Events, a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	log.Printf("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
}
作者:Skyscanne
项目:bosu
// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
// [AlertName][Severity]Count
severityCounts := make(map[string]map[string]int64)
abnormalCounts := make(map[string]map[string]int64)
ackStatusCounts := make(map[string]map[bool]int64)
ackByNotificationCounts := make(map[string]map[bool]int64)
unAckOldestByNotification := make(map[string]time.Time)
activeStatusCounts := make(map[string]map[bool]int64)
// Initalize the Counts
for _, alert := range s.Conf.Alerts {
severityCounts[alert.Name] = make(map[string]int64)
abnormalCounts[alert.Name] = make(map[string]int64)
var i models.Status
for i = 1; i.String() != "none"; i++ {
severityCounts[alert.Name][i.String()] = 0
abnormalCounts[alert.Name][i.String()] = 0
}
ackStatusCounts[alert.Name] = make(map[bool]int64)
activeStatusCounts[alert.Name] = make(map[bool]int64)
ackStatusCounts[alert.Name][false] = 0
activeStatusCounts[alert.Name][false] = 0
ackStatusCounts[alert.Name][true] = 0
activeStatusCounts[alert.Name][true] = 0
}
for notificationName := range s.Conf.Notifications {
unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999)
ackByNotificationCounts[notificationName] = make(map[bool]int64)
ackByNotificationCounts[notificationName][false] = 0
ackByNotificationCounts[notificationName][true] = 0
}
//TODO:
// for _, state := range s.status {
// if !state.Open {
// continue
// }
// name := state.AlertKey.Name()
// alertDef := s.Conf.Alerts[name]
// nots := make(map[string]bool)
// for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
// nots[name] = true
// }
// for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
// nots[name] = true
// }
// incident, err := s.GetIncident(state.Last().IncidentId)
// if err != nil {
// slog.Errorln(err)
// }
// for notificationName := range nots {
// ackByNotificationCounts[notificationName][state.NeedAck]++
// if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
// unAckOldestByNotification[notificationName] = incident.Start
// }
// }
// severity := state.CurrentStatus.String()
// lastAbnormal := state.LastAbnormalStatus.String()
// severityCounts[state.Alert][severity]++
// abnormalCounts[state.Alert][lastAbnormal]++
// ackStatusCounts[state.Alert][state.NeedAck]++
// activeStatusCounts[state.Alert][state.IsActive()]++
// }
for notification := range ackByNotificationCounts {
ts := opentsdb.TagSet{"notification": notification}
err := collect.Put("alerts.acknowledgement_status_by_notification",
ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
ackByNotificationCounts[notification][true])
if err != nil {
slog.Errorln(err)
}
err = collect.Put("alerts.acknowledgement_status_by_notification",
ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
ackByNotificationCounts[notification][false])
if err != nil {
slog.Errorln(err)
}
}
for notification, timeStamp := range unAckOldestByNotification {
ts := opentsdb.TagSet{"notification": notification}
var ago time.Duration
if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) {
ago = utcNow().Sub(timeStamp)
}
err := collect.Put("alerts.oldest_unacked_by_notification",
ts,
ago.Seconds())
if err != nil {
slog.Errorln(err)
}
}
for alertName := range severityCounts {
ts := opentsdb.TagSet{"alert": alertName}
// The tagset of the alert is not included because there is no way to
// store the string of a group in OpenTSBD in a parsable way. This is
// because any delimiter we chose could also be part of a tag key or tag
// value.
for severity := range severityCounts[alertName] {
err := collect.Put("alerts.current_severity",
ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
severityCounts[alertName][severity])
//.........这里部分代码省略.........
作者:eswd
项目:bosu
//.........这里部分代码省略.........
for _, x := range conf.ExtraHop {
check(collectors.ExtraHop(x.Host, x.APIKey, x.FilterBy, x.FilterPercent))
}
if err != nil {
slog.Fatal(err)
}
collectors.KeepalivedCommunity = conf.KeepalivedCommunity
// Add all process collectors. This is platform specific.
collectors.WatchProcesses()
collectors.WatchProcessesDotNet()
if *flagFake > 0 {
collectors.InitFake(*flagFake)
}
collect.Debug = *flagDebug
util.Debug = *flagDebug
collect.DisableDefaultCollectors = conf.DisableSelf
c := collectors.Search(conf.Filter)
if len(c) == 0 {
slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
}
for _, col := range c {
col.Init()
}
u, err := parseHost(conf.Host)
if *flagList {
list(c)
return
} else if *flagPrint {
u = &url.URL{Scheme: "http", Host: "localhost:0"}
} else if err != nil {
slog.Fatalf("invalid host %v: %v", conf.Host, err)
}
freq := time.Second * time.Duration(conf.Freq)
if freq <= 0 {
slog.Fatal("freq must be > 0")
}
collectors.DefaultFreq = freq
collect.Freq = freq
if conf.BatchSize < 0 {
slog.Fatal("BatchSize must be > 0")
}
if conf.BatchSize != 0 {
collect.BatchSize = conf.BatchSize
}
collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
if *flagPrint {
collect.Print = true
}
if !*flagDisableMetadata {
if err := metadata.Init(u, *flagDebug); err != nil {
slog.Fatal(err)
}
}
cdp, cquit := collectors.Run(c)
if u != nil {
slog.Infoln("OpenTSDB host:", u)
}
if err := collect.InitChan(u, "scollector", cdp); err != nil {
slog.Fatal(err)
}
if version.VersionDate != "" {
v, err := strconv.ParseInt(version.VersionDate, 10, 64)
if err == nil {
go func() {
metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
"Scollector version number, which indicates when scollector was built.")
for {
if err := collect.Put("version", collect.Tags, v); err != nil {
slog.Error(err)
}
time.Sleep(time.Hour)
}
}()
}
}
if *flagBatchSize > 0 {
collect.BatchSize = *flagBatchSize
}
go func() {
const maxMem = 500 * 1024 * 1024 // 500MB
var m runtime.MemStats
for range time.Tick(time.Minute) {
runtime.ReadMemStats(&m)
if m.Alloc > maxMem {
panic("memory max reached")
}
}
}()
sChan := make(chan os.Signal)
signal.Notify(sChan, os.Interrupt)
<-sChan
close(cquit)
// try to flush all datapoints on sigterm, but quit after 5 seconds no matter what.
time.AfterFunc(5*time.Second, func() {
os.Exit(0)
})
collect.Flush()
}
作者:nicolle
项目:bosu
//.........这里部分代码省略.........
if err != nil {
slog.Fatalf("Error adding tag overrides: %s", err)
}
u, err := parseHost(conf.Host)
if *flagList {
list(c)
return
} else if *flagPrint {
u = &url.URL{Scheme: "http", Host: "localhost:0"}
} else if err != nil {
slog.Fatalf("invalid host %v: %v", conf.Host, err)
}
freq := time.Second * time.Duration(conf.Freq)
if freq <= 0 {
slog.Fatal("freq must be > 0")
}
collectors.DefaultFreq = freq
collect.Freq = freq
if conf.BatchSize < 0 {
slog.Fatal("BatchSize must be > 0")
}
if conf.BatchSize != 0 {
collect.BatchSize = conf.BatchSize
}
collect.Tags = conf.Tags.Copy().Merge(opentsdb.TagSet{"os": runtime.GOOS})
if *flagPrint {
collect.Print = true
}
if !*flagDisableMetadata {
if err := metadata.Init(u, *flagDebug); err != nil {
slog.Fatal(err)
}
}
cdp, cquit := collectors.Run(c)
if u != nil {
slog.Infoln("OpenTSDB host:", u)
}
collect.UseNtlm = conf.UseNtlm
if err := collect.InitChan(u, "scollector", cdp); err != nil {
slog.Fatal(err)
}
if collect.DisableDefaultCollectors == false && version.VersionDate != "" {
v, err := strconv.ParseInt(version.VersionDate, 10, 64)
if err == nil {
go func() {
metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
"Scollector version number, which indicates when scollector was built.")
for {
if err := collect.Put("version", collect.Tags, v); err != nil {
slog.Error(err)
}
time.Sleep(time.Hour)
}
}()
}
}
if *flagBatchSize > 0 {
collect.BatchSize = *flagBatchSize
}
if conf.MaxQueueLen != 0 {
if conf.MaxQueueLen < collect.BatchSize {
slog.Fatalf("MaxQueueLen must be >= %d (BatchSize)", collect.BatchSize)
}
collect.MaxQueueLen = conf.MaxQueueLen
}
maxMemMB := uint64(500)
if conf.MaxMem != 0 {
maxMemMB = conf.MaxMem
}
go func() {
var m runtime.MemStats
for range time.Tick(time.Second * 30) {
runtime.ReadMemStats(&m)
allocMB := m.Alloc / 1024 / 1024
if allocMB > maxMemMB {
slog.Fatalf("memory max runtime reached: (current alloc: %v megabytes, max: %v megabytes)", allocMB, maxMemMB)
}
//See proccess_windows.go and process_linux.go for total process memory usage.
//Note that in linux the rss metric includes shared pages, where as in
//Windows the private working set does not include shared memory.
//Total memory used seems to scale linerarly with m.Alloc.
//But we want this to catch a memory leak outside the runtime (WMI/CGO).
//So for now just add any runtime allocations to the allowed total limit.
maxMemTotalMB := maxMemMB + allocMB
if collectors.TotalScollectorMemoryMB > maxMemTotalMB {
slog.Fatalf("memory max total reached: (current total: %v megabytes, current runtime alloc: %v megabytes, max: %v megabytes)", collectors.TotalScollectorMemoryMB, allocMB, maxMemTotalMB)
}
}
}()
sChan := make(chan os.Signal)
signal.Notify(sChan, os.Interrupt)
<-sChan
close(cquit)
// try to flush all datapoints on sigterm, but quit after 5 seconds no matter what.
time.AfterFunc(5*time.Second, func() {
os.Exit(0)
})
collect.Flush()
}
作者:snowsnai
项目:bosu
// CollectStates sends various state information to bosun with collect:
// per-alert counts of current severity, last abnormal severity,
// acknowledgement status, and active status.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts so every alert/severity combination is
	// reported, even when zero.
	for _, alert := range s.Conf.Alerts {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		// Walk every defined severity; Status stringifies to "none" one
		// past the last real severity.
		var i Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	// Tally only open states.
	for _, state := range s.status {
		if !state.Open {
			continue
		}
		severity := state.Status().String()
		lastAbnormal := state.AbnormalStatus().String()
		severityCounts[state.Alert][severity]++
		abnormalCounts[state.Alert][lastAbnormal]++
		ackStatusCounts[state.Alert][state.NeedAck]++
		activeStatusCounts[state.Alert][state.IsActive()]++
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSBD in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		// Fix: this error was previously overwritten without being checked.
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}
作者:couchan
项目:bosu
//.........这里部分代码省略.........
check(collectors.ICMP(i.Host))
}
for _, a := range conf.AWS {
check(collectors.AWS(a.AccessKey, a.SecretKey, a.Region))
}
for _, v := range conf.Vsphere {
check(collectors.Vsphere(v.User, v.Password, v.Host))
}
for _, p := range conf.Process {
check(collectors.AddProcessConfig(p))
}
for _, h := range conf.HTTPUnit {
if h.TOML != "" {
check(collectors.HTTPUnitTOML(h.TOML))
}
if h.Hiera != "" {
check(collectors.HTTPUnitHiera(h.Hiera))
}
}
if err != nil {
slog.Fatal(err)
}
collectors.KeepalivedCommunity = conf.KeepalivedCommunity
// Add all process collectors. This is platform specific.
collectors.WatchProcesses()
collectors.WatchProcessesDotNet()
if *flagFake > 0 {
collectors.InitFake(*flagFake)
}
collect.Debug = *flagDebug
util.Debug = *flagDebug
collect.DisableDefaultCollectors = conf.DisableSelf
c := collectors.Search(conf.Filter)
if len(c) == 0 {
slog.Fatalf("Filter %v matches no collectors.", conf.Filter)
}
for _, col := range c {
col.Init()
}
u, err := parseHost(conf.Host)
if *flagList {
list(c)
return
} else if err != nil {
slog.Fatalf("invalid host %v: %v", conf.Host, err)
}
freq := time.Second * time.Duration(conf.Freq)
if freq <= 0 {
slog.Fatal("freq must be > 0")
}
collectors.DefaultFreq = freq
collect.Freq = freq
collect.Tags = opentsdb.TagSet{"os": runtime.GOOS}
if *flagPrint {
collect.Print = true
}
if !*flagDisableMetadata {
if err := metadata.Init(u, *flagDebug); err != nil {
slog.Fatal(err)
}
}
cdp := collectors.Run(c)
if u != nil {
slog.Infoln("OpenTSDB host:", u)
}
if err := collect.InitChan(u, "scollector", cdp); err != nil {
slog.Fatal(err)
}
if version.VersionDate != "" {
v, err := strconv.ParseInt(version.VersionDate, 10, 64)
if err == nil {
go func() {
metadata.AddMetricMeta("scollector.version", metadata.Gauge, metadata.None,
"Scollector version number, which indicates when scollector was built.")
for {
if err := collect.Put("version", collect.Tags, v); err != nil {
slog.Error(err)
}
time.Sleep(time.Hour)
}
}()
}
}
if *flagBatchSize > 0 {
collect.BatchSize = *flagBatchSize
}
go func() {
const maxMem = 500 * 1024 * 1024 // 500MB
var m runtime.MemStats
for range time.Tick(time.Minute) {
runtime.ReadMemStats(&m)
if m.Alloc > maxMem {
panic("memory max reached")
}
}
}()
select {}
}