作者:nicolle
项目:bosu
// TouchAlertKey records t as the last-touched time of the alert key ak.
//
// It issues a ZADD against the key returned by statesLastTouchedKey for the
// alert's name, using the UTC Unix timestamp of t as the score and the string
// form of ak as the member. Any error from the connection is passed through
// slog.Wrap before being returned.
func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error {
	c := d.Get()
	defer c.Close()

	touchedKey := statesLastTouchedKey(ak.Name())
	score := t.UTC().Unix()
	member := string(ak)

	_, doErr := c.Do("ZADD", touchedKey, score, member)
	return slog.Wrap(doErr)
}
作者:eswd
项目:bosu
// NewStatus builds a fresh State for the alert key ak, filling in the alert
// name, the group taken from the key, and the tags derived from that group.
// All other fields are left at their zero values.
func NewStatus(ak models.AlertKey) *State {
	group := ak.Group()

	st := new(State)
	st.Alert = ak.Name()
	st.Tags = group.Tags()
	st.Group = group
	return st
}
作者:kroni
项目:bosu
// TouchAlertKey records t as the last-touched time of the alert key ak.
//
// The call is timed under the "redis" metric with op "TouchAlertKey". It
// issues a ZADD against the key returned by statesLastTouchedKey for the
// alert's name, using the UTC Unix timestamp of t as the score and the string
// form of ak as the member. Any error is passed through slog.Wrap.
func (d *dataAccess) TouchAlertKey(ak models.AlertKey, t time.Time) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "TouchAlertKey"})()

	c := d.GetConnection()
	defer c.Close()

	touchedKey := statesLastTouchedKey(ak.Name())
	score := t.UTC().Unix()
	member := string(ak)

	_, doErr := c.Do("ZADD", touchedKey, score, member)
	return slog.Wrap(doErr)
}
作者:Skyscanne
项目:bosu
// NewIncident creates an incident state for the alert key ak. The incident
// starts at the current utcNow() time, carries the alert key, its alert name
// and the tags of its group, and gets an empty Result so callers can use the
// result without a nil check.
func NewIncident(ak models.AlertKey) *models.IncidentState {
	incident := new(models.IncidentState)
	incident.Start = utcNow()
	incident.AlertKey = ak
	incident.Alert, incident.Tags = ak.Name(), ak.Group().Tags()
	incident.Result = new(models.Result)
	return incident
}
作者:noblehn
项目:bosu
func (s *Schedule) Action(user, message string, t models.ActionType, ak models.AlertKey) error {
if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
slog.Errorln(err)
}
st, err := s.DataAccess.State().GetLatestIncident(ak)
if err != nil {
return err
}
if st == nil {
return fmt.Errorf("no such alert key: %v", ak)
}
isUnknown := st.LastAbnormalStatus == models.StUnknown
timestamp := utcNow()
switch t {
case models.ActionAcknowledge:
if !st.NeedAck {
return fmt.Errorf("alert already acknowledged")
}
if !st.Open {
return fmt.Errorf("cannot acknowledge closed alert")
}
st.NeedAck = false
if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
return err
}
case models.ActionClose:
if st.IsActive() {
return fmt.Errorf("cannot close active alert")
}
fallthrough
case models.ActionForceClose:
st.Open = false
st.End = ×tamp
case models.ActionForget:
if !isUnknown {
return fmt.Errorf("can only forget unknowns")
}
fallthrough
case models.ActionPurge:
return s.DataAccess.State().Forget(ak)
default:
return fmt.Errorf("unknown action type: %v", t)
}
// Would like to also track the alert group, but I believe this is impossible because any character
// that could be used as a delimiter could also be a valid tag key or tag value character
if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
slog.Errorln(err)
}
st.Actions = append(st.Actions, models.Action{
Message: message,
Time: timestamp,
Type: t,
User: user,
})
_, err = s.DataAccess.State().UpdateIncidentState(st)
return err
}
作者:nicolle
项目:bosu
// SetUnevaluated adds ak to, or removes it from, the set stored under the key
// returned by statesUnevalKey for the alert's name: SADD when uneval is true,
// SREM otherwise. Any error is passed through slog.Wrap.
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error {
	c := d.Get()
	defer c.Close()

	var command string
	switch {
	case uneval:
		command = "SADD"
	default:
		command = "SREM"
	}

	setKey := statesUnevalKey(ak.Name())
	_, doErr := c.Do(command, setKey, ak)
	return slog.Wrap(doErr)
}
作者:kroni
项目:bosu
// SetUnevaluated adds ak to, or removes it from, the set stored under the key
// returned by statesUnevalKey for the alert's name: SADD when uneval is true,
// SREM otherwise. The call is timed under the "redis" metric with op
// "SetUnevaluated". Any error is passed through slog.Wrap.
func (d *dataAccess) SetUnevaluated(ak models.AlertKey, uneval bool) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "SetUnevaluated"})()

	c := d.GetConnection()
	defer c.Close()

	var command string
	switch {
	case uneval:
		command = "SADD"
	default:
		command = "SREM"
	}

	setKey := statesUnevalKey(ak.Name())
	_, doErr := c.Do(command, setKey, ak)
	return slog.Wrap(doErr)
}
作者:kroni
项目:bosu
// Forget is the nuclear option: it deletes everything stored about the alert
// key ak.
//
// Inside d.transact it removes ak from the last-touched sorted set, from the
// unknown and unevaluated sets and from the open-incidents hash, deletes the
// state of every incident listed for ak, and finally clears that incident
// list. The first failing command aborts the sequence and its error is
// returned through slog.Wrap. The call is timed under the "redis" metric with
// op "Forget".
func (d *dataAccess) Forget(ak models.AlertKey) error {
	defer collect.StartTimer("redis", opentsdb.TagSet{"op": "Forget"})()
	conn := d.GetConnection()
	defer conn.Close()
	alert := ak.Name()
	return d.transact(conn, func() error {
		// Last touched. TouchAlertKey writes this entry with ZADD using
		// string(ak) as the member, so the key holds a sorted set and the
		// entry has to be removed with ZREM; HDEL is a hash command and
		// does not apply to it.
		if _, err := conn.Do("ZREM", statesLastTouchedKey(alert), string(ak)); err != nil {
			return slog.Wrap(err)
		}
		// Unknown/unevaluated sets.
		if _, err := conn.Do("SREM", statesUnknownKey(alert), ak); err != nil {
			return slog.Wrap(err)
		}
		if _, err := conn.Do("SREM", statesUnevalKey(alert), ak); err != nil {
			return slog.Wrap(err)
		}
		// Open set. One HDEL is enough; repeating it for the same field is a no-op.
		if _, err := conn.Do("HDEL", statesOpenIncidentsKey, ak); err != nil {
			return slog.Wrap(err)
		}
		// All incidents recorded for this alert key.
		ids, err := int64s(conn.Do("LRANGE", incidentsForAlertKeyKey(ak), 0, -1))
		if err != nil {
			return slog.Wrap(err)
		}
		for _, id := range ids {
			if _, err = conn.Do("DEL", incidentStateKey(id)); err != nil {
				return slog.Wrap(err)
			}
		}
		// Finally drop the list of incident ids itself.
		if _, err := conn.Do(d.LCLEAR(), incidentsForAlertKeyKey(ak)); err != nil {
			return slog.Wrap(err)
		}
		return nil
	})
}
作者:Skyscanne
项目:bosu
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
event.Time = r.Start
a := s.Conf.Alerts[ak.Name()]
if a.UnknownsNormal && event.Status == models.StUnknown {
event.Status = models.StNormal
}
data := s.DataAccess.State()
err = data.TouchAlertKey(ak, utcNow())
if err != nil {
return
}
si := silenced(ak)
// get existing open incident if exists
var incident *models.IncidentState
incident, err = data.GetOpenIncident(ak)
if err != nil {
return
}
defer func() {
// save unless incident is new and closed (log alert)
if incident != nil && (incident.Id != 0 || incident.Open) {
_, err = data.UpdateIncidentState(incident)
} else {
err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
}
}()
// If nothing is out of the ordinary we are done
if event.Status <= models.StNormal && incident == nil {
return
}
// if event is unevaluated, we are done also.
if incident != nil {
incident.Unevaluated = event.Unevaluated
}
if event.Unevaluated {
return
}
shouldNotify := false
newIncident := false
if incident == nil {
incident = NewIncident(ak)
newIncident = true
shouldNotify = true
}
// VICTOROPS INTEGRATION: Enables notification of incidents which have returned to normal (Sends normNotification defined in config)
if event.Status <= models.StNormal && (incident.CurrentStatus == models.StWarning || incident.CurrentStatus == models.StCritical) {
slog.Infof("TRIGGER_RESOLVED: from %s to %s", incident.CurrentStatus, event.Status)
shouldNotify = true
}
// VICTOROPS INTEGRATION: Enables notification of Incidents which have returned to normal but are now back to warning or critical. i.e. enable Flapping
if incident.CurrentStatus == models.StNormal && (event.Status == models.StCritical || event.Status == models.StWarning) {
slog.Infof("TRIGGER_REALERT: from %s to %s", incident.CurrentStatus, event.Status)
shouldNotify = true
}
// set state.Result according to event result
if event.Status == models.StCritical {
incident.Result = event.Crit
} else if event.Status == models.StWarning {
incident.Result = event.Warn
}
if event.Status > models.StNormal {
incident.LastAbnormalStatus = event.Status
incident.LastAbnormalTime = event.Time.UTC().Unix()
}
if event.Status > incident.WorstStatus {
incident.WorstStatus = event.Status
shouldNotify = true
}
if event.Status != incident.CurrentStatus {
incident.Events = append(incident.Events, *event)
}
incident.CurrentStatus = event.Status
//run a preliminary save on new incidents to get an id
if newIncident {
if a.Log || silencedOrIgnored(a, event, si) {
//a log or silenced/ignored alert will not need to be saved
} else {
incident.Id, err = s.DataAccess.State().UpdateIncidentState(incident)
if err != nil {
return
}
}
}
//render templates and open alert key if abnormal
if event.Status > models.StNormal {
s.executeTemplates(incident, event, a, r)
incident.Open = true
if a.Log {
//.........这里部分代码省略.........
作者:jareks
项目:bosu
// notsByAlertKeyKey returns the storage key "notsByAlert:<alert name>" for ak.
// Only the alert name of the key is used, so every alert key belonging to the
// same alert maps to the same storage key.
func notsByAlertKeyKey(ak models.AlertKey) string {
	alertName := ak.Name()
	return fmt.Sprintf("notsByAlert:%s", alertName)
}
作者:eswd
项目:bosu
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *Event, silenced map[models.AlertKey]models.Silence) bool {
checkNotify := false
// get existing state object for alert key. add to schedule status if doesn't already exist
state := s.GetStatus(ak)
if state == nil {
state = NewStatus(ak)
s.SetStatus(ak, state)
}
defer s.SetStatus(ak, state)
// make sure we always touch the state.
state.Touched = r.Start
// set state.Result according to event result
if event.Crit != nil {
state.Result = event.Crit
} else if event.Warn != nil {
state.Result = event.Warn
}
// if event is unevaluated, we are done.
state.Unevaluated = event.Unevaluated
if event.Unevaluated {
return checkNotify
}
// assign incident id to new event if applicable
prev := state.Last()
worst := StNormal
event.Time = r.Start
if prev.IncidentId != 0 {
// If last event has incident id and is not closed, we continue it.
incident, err := s.DataAccess.Incidents().GetIncident(prev.IncidentId)
if err != nil {
slog.Error(err)
} else if incident.End == nil {
event.IncidentId = prev.IncidentId
worst = state.WorstThisIncident()
}
}
if event.IncidentId == 0 && event.Status != StNormal {
incident, err := s.createIncident(ak, event.Time)
if err != nil {
slog.Error("Error creating incident", err)
} else {
event.IncidentId = incident.Id
}
}
state.Append(event)
a := s.Conf.Alerts[ak.Name()]
// render templates and open alert key if abnormal
if event.Status > StNormal {
s.executeTemplates(state, event, a, r)
state.Open = true
if a.Log {
worst = StNormal
state.Open = false
}
}
// On state increase, clear old notifications and notify current.
// If the old alert was not acknowledged, do nothing.
// Do nothing if state did not change.
notify := func(ns *conf.Notifications) {
if a.Log {
lastLogTime := state.LastLogTime
now := time.Now()
if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
return
}
state.LastLogTime = now
}
nots := ns.Get(s.Conf, state.Group)
for _, n := range nots {
s.Notify(state, n)
checkNotify = true
}
}
notifyCurrent := func() {
// Auto close ignoreUnknowns.
if a.IgnoreUnknown && event.Status == StUnknown {
state.Open = false
state.Forgotten = true
state.NeedAck = false
state.Action("bosun", "Auto close because alert has ignoreUnknown.", ActionClose, event.Time)
slog.Infof("auto close %s because alert has ignoreUnknown", ak)
return
} else if silenced[ak].Forget && event.Status == StUnknown {
state.Open = false
state.Forgotten = true
state.NeedAck = false
state.Action("bosun", "Auto close because alert is silenced and marked auto forget.", ActionClose, event.Time)
slog.Infof("auto close %s because alert is silenced and marked auto forget", ak)
return
}
state.NeedAck = true
switch event.Status {
case StCritical, StUnknown:
notify(a.CritNotification)
case StWarning:
notify(a.WarnNotification)
}
}
//.........这里部分代码省略.........
作者:Victoria
项目:bosu
// RunHistory for a single alert key. Returns true if notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
event.Time = r.Start
data := s.DataAccess.State()
err = data.TouchAlertKey(ak, time.Now())
if err != nil {
return
}
// get existing open incident if exists
incident, err := data.GetOpenIncident(ak)
if err != nil {
return
}
defer func() {
// save unless incident is new and closed (log alert)
if incident != nil && (incident.Id != 0 || incident.Open) {
err = data.UpdateIncidentState(incident)
} else {
err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
}
}()
// If nothing is out of the ordinary we are done
if event.Status <= models.StNormal && incident == nil {
return
}
// if event is unevaluated, we are done also.
if incident != nil {
incident.Unevaluated = event.Unevaluated
}
if event.Unevaluated {
return
}
shouldNotify := false
if incident == nil {
incident = NewIncident(ak)
shouldNotify = true
}
// set state.Result according to event result
if event.Status == models.StCritical {
incident.Result = event.Crit
} else if event.Status == models.StWarning {
incident.Result = event.Warn
}
if event.Status > models.StNormal {
incident.LastAbnormalStatus = event.Status
incident.LastAbnormalTime = event.Time.UTC().Unix()
}
if event.Status > incident.WorstStatus {
incident.WorstStatus = event.Status
shouldNotify = true
}
if event.Status != incident.CurrentStatus {
incident.Events = append(incident.Events, *event)
}
incident.CurrentStatus = event.Status
a := s.Conf.Alerts[ak.Name()]
//render templates and open alert key if abnormal
if event.Status > models.StNormal {
s.executeTemplates(incident, event, a, r)
incident.Open = true
if a.Log {
incident.Open = false
}
}
// On state increase, clear old notifications and notify current.
// Do nothing if state did not change.
notify := func(ns *conf.Notifications) {
if a.Log {
lastLogTime := s.lastLogTimes[ak]
now := time.Now()
if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
return
}
s.lastLogTimes[ak] = now
}
nots := ns.Get(s.Conf, incident.AlertKey.Group())
for _, n := range nots {
s.Notify(incident, n)
checkNotify = true
}
}
notifyCurrent := func() {
si := silenced(ak)
//Auto close ignoreUnknowns for new incident.
if a.IgnoreUnknown && event.Status == models.StUnknown {
incident.Open = false
return
} else if si != nil && si.Forget && event.Status == models.StUnknown {
incident.Open = false
return
}
incident.NeedAck = true
switch event.Status {
case models.StCritical, models.StUnknown:
//.........这里部分代码省略.........
作者:rajde
项目:bosu
func (s *Schedule) Action(user, message string, t ActionType, ak models.AlertKey) error {
s.Lock("Action")
defer s.Unlock()
st := s.status[ak]
if st == nil {
return fmt.Errorf("no such alert key: %v", ak)
}
ack := func() {
delete(s.Notifications, ak)
st.NeedAck = false
}
isUnknown := st.AbnormalStatus() == StUnknown
timestamp := time.Now().UTC()
switch t {
case ActionAcknowledge:
if !st.NeedAck {
return fmt.Errorf("alert already acknowledged")
}
if !st.Open {
return fmt.Errorf("cannot acknowledge closed alert")
}
ack()
case ActionClose:
if st.NeedAck {
ack()
}
if st.IsActive() {
return fmt.Errorf("cannot close active alert")
}
st.Open = false
last := st.Last()
if last.IncidentId != 0 {
incident, err := s.DataAccess.Incidents().GetIncident(last.IncidentId)
if err != nil {
return err
}
incident.End = ×tamp
if err = s.DataAccess.Incidents().UpdateIncident(last.IncidentId, incident); err != nil {
return err
}
}
case ActionForget:
if !isUnknown {
return fmt.Errorf("can only forget unknowns")
}
if st.NeedAck {
ack()
}
st.Open = false
st.Forgotten = true
delete(s.status, ak)
default:
return fmt.Errorf("unknown action type: %v", t)
}
st.Action(user, message, t, timestamp)
// Would like to also track the alert group, but I believe this is impossible because any character
// that could be used as a delimiter could also be a valid tag key or tag value character
if err := collect.Add("actions", opentsdb.TagSet{"user": user, "alert": ak.Name(), "type": t.String()}, 1); err != nil {
slog.Errorln(err)
}
return nil
}