cloudwatch.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613
  1. package cloudwatch
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "sort"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "github.com/grafana/grafana/pkg/log"
  12. "github.com/grafana/grafana/pkg/models"
  13. "github.com/grafana/grafana/pkg/setting"
  14. "github.com/grafana/grafana/pkg/tsdb"
  15. "golang.org/x/sync/errgroup"
  16. "github.com/aws/aws-sdk-go/aws"
  17. "github.com/aws/aws-sdk-go/aws/awserr"
  18. "github.com/aws/aws-sdk-go/aws/request"
  19. "github.com/aws/aws-sdk-go/service/cloudwatch"
  20. "github.com/aws/aws-sdk-go/service/ec2/ec2iface"
  21. "github.com/aws/aws-sdk-go/service/resourcegroupstaggingapi/resourcegroupstaggingapiiface"
  22. "github.com/grafana/grafana/pkg/components/null"
  23. "github.com/grafana/grafana/pkg/components/simplejson"
  24. "github.com/grafana/grafana/pkg/infra/metrics"
  25. )
  26. type CloudWatchExecutor struct {
  27. *models.DataSource
  28. ec2Svc ec2iface.EC2API
  29. rgtaSvc resourcegroupstaggingapiiface.ResourceGroupsTaggingAPIAPI
  30. }
  31. type DatasourceInfo struct {
  32. Profile string
  33. Region string
  34. AuthType string
  35. AssumeRoleArn string
  36. Namespace string
  37. AccessKey string
  38. SecretKey string
  39. }
  40. func NewCloudWatchExecutor(dsInfo *models.DataSource) (tsdb.TsdbQueryEndpoint, error) {
  41. return &CloudWatchExecutor{}, nil
  42. }
  43. var (
  44. plog log.Logger
  45. standardStatistics map[string]bool
  46. aliasFormat *regexp.Regexp
  47. )
  48. func init() {
  49. plog = log.New("tsdb.cloudwatch")
  50. tsdb.RegisterTsdbQueryEndpoint("cloudwatch", NewCloudWatchExecutor)
  51. standardStatistics = map[string]bool{
  52. "Average": true,
  53. "Maximum": true,
  54. "Minimum": true,
  55. "Sum": true,
  56. "SampleCount": true,
  57. }
  58. aliasFormat = regexp.MustCompile(`\{\{\s*(.+?)\s*\}\}`)
  59. }
  60. func (e *CloudWatchExecutor) Query(ctx context.Context, dsInfo *models.DataSource, queryContext *tsdb.TsdbQuery) (*tsdb.Response, error) {
  61. var result *tsdb.Response
  62. e.DataSource = dsInfo
  63. queryType := queryContext.Queries[0].Model.Get("type").MustString("")
  64. var err error
  65. switch queryType {
  66. case "metricFindQuery":
  67. result, err = e.executeMetricFindQuery(ctx, queryContext)
  68. case "annotationQuery":
  69. result, err = e.executeAnnotationQuery(ctx, queryContext)
  70. case "timeSeriesQuery":
  71. fallthrough
  72. default:
  73. result, err = e.executeTimeSeriesQuery(ctx, queryContext)
  74. }
  75. return result, err
  76. }
  77. func (e *CloudWatchExecutor) executeTimeSeriesQuery(ctx context.Context, queryContext *tsdb.TsdbQuery) (*tsdb.Response, error) {
  78. results := &tsdb.Response{
  79. Results: make(map[string]*tsdb.QueryResult),
  80. }
  81. resultChan := make(chan *tsdb.QueryResult, len(queryContext.Queries))
  82. eg, ectx := errgroup.WithContext(ctx)
  83. getMetricDataQueries := make(map[string]map[string]*CloudWatchQuery)
  84. for i, model := range queryContext.Queries {
  85. queryType := model.Model.Get("type").MustString()
  86. if queryType != "timeSeriesQuery" && queryType != "" {
  87. continue
  88. }
  89. RefId := queryContext.Queries[i].RefId
  90. query, err := parseQuery(queryContext.Queries[i].Model)
  91. if err != nil {
  92. results.Results[RefId] = &tsdb.QueryResult{
  93. Error: err,
  94. }
  95. return results, nil
  96. }
  97. query.RefId = RefId
  98. if query.Id != "" {
  99. if _, ok := getMetricDataQueries[query.Region]; !ok {
  100. getMetricDataQueries[query.Region] = make(map[string]*CloudWatchQuery)
  101. }
  102. getMetricDataQueries[query.Region][query.Id] = query
  103. continue
  104. }
  105. if query.Id == "" && query.Expression != "" {
  106. results.Results[query.RefId] = &tsdb.QueryResult{
  107. Error: fmt.Errorf("Invalid query: id should be set if using expression"),
  108. }
  109. return results, nil
  110. }
  111. eg.Go(func() error {
  112. defer func() {
  113. if err := recover(); err != nil {
  114. plog.Error("Execute Query Panic", "error", err, "stack", log.Stack(1))
  115. if theErr, ok := err.(error); ok {
  116. resultChan <- &tsdb.QueryResult{
  117. RefId: query.RefId,
  118. Error: theErr,
  119. }
  120. }
  121. }
  122. }()
  123. queryRes, err := e.executeQuery(ectx, query, queryContext)
  124. if ae, ok := err.(awserr.Error); ok && ae.Code() == "500" {
  125. return err
  126. }
  127. if err != nil {
  128. resultChan <- &tsdb.QueryResult{
  129. RefId: query.RefId,
  130. Error: err,
  131. }
  132. return nil
  133. }
  134. resultChan <- queryRes
  135. return nil
  136. })
  137. }
  138. if len(getMetricDataQueries) > 0 {
  139. for region, getMetricDataQuery := range getMetricDataQueries {
  140. q := getMetricDataQuery
  141. eg.Go(func() error {
  142. defer func() {
  143. if err := recover(); err != nil {
  144. plog.Error("Execute Get Metric Data Query Panic", "error", err, "stack", log.Stack(1))
  145. if theErr, ok := err.(error); ok {
  146. resultChan <- &tsdb.QueryResult{
  147. Error: theErr,
  148. }
  149. }
  150. }
  151. }()
  152. queryResponses, err := e.executeGetMetricDataQuery(ectx, region, q, queryContext)
  153. if ae, ok := err.(awserr.Error); ok && ae.Code() == "500" {
  154. return err
  155. }
  156. for _, queryRes := range queryResponses {
  157. if err != nil {
  158. queryRes.Error = err
  159. }
  160. resultChan <- queryRes
  161. }
  162. return nil
  163. })
  164. }
  165. }
  166. if err := eg.Wait(); err != nil {
  167. return nil, err
  168. }
  169. close(resultChan)
  170. for result := range resultChan {
  171. results.Results[result.RefId] = result
  172. }
  173. return results, nil
  174. }
  175. func (e *CloudWatchExecutor) executeQuery(ctx context.Context, query *CloudWatchQuery, queryContext *tsdb.TsdbQuery) (*tsdb.QueryResult, error) {
  176. client, err := e.getClient(query.Region)
  177. if err != nil {
  178. return nil, err
  179. }
  180. startTime, err := queryContext.TimeRange.ParseFrom()
  181. if err != nil {
  182. return nil, err
  183. }
  184. endTime, err := queryContext.TimeRange.ParseTo()
  185. if err != nil {
  186. return nil, err
  187. }
  188. if !startTime.Before(endTime) {
  189. return nil, fmt.Errorf("Invalid time range: Start time must be before end time")
  190. }
  191. params := &cloudwatch.GetMetricStatisticsInput{
  192. Namespace: aws.String(query.Namespace),
  193. MetricName: aws.String(query.MetricName),
  194. Dimensions: query.Dimensions,
  195. Period: aws.Int64(int64(query.Period)),
  196. }
  197. if len(query.Statistics) > 0 {
  198. params.Statistics = query.Statistics
  199. }
  200. if len(query.ExtendedStatistics) > 0 {
  201. params.ExtendedStatistics = query.ExtendedStatistics
  202. }
  203. // 1 minutes resolution metrics is stored for 15 days, 15 * 24 * 60 = 21600
  204. if query.HighResolution && (((endTime.Unix() - startTime.Unix()) / int64(query.Period)) > 21600) {
  205. return nil, errors.New("too long query period")
  206. }
  207. var resp *cloudwatch.GetMetricStatisticsOutput
  208. for startTime.Before(endTime) {
  209. params.StartTime = aws.Time(startTime)
  210. if query.HighResolution {
  211. startTime = startTime.Add(time.Duration(1440*query.Period) * time.Second)
  212. } else {
  213. startTime = endTime
  214. }
  215. params.EndTime = aws.Time(startTime)
  216. if setting.Env == setting.DEV {
  217. plog.Debug("CloudWatch query", "raw query", params)
  218. }
  219. partResp, err := client.GetMetricStatisticsWithContext(ctx, params, request.WithResponseReadTimeout(10*time.Second))
  220. if err != nil {
  221. return nil, err
  222. }
  223. if resp != nil {
  224. resp.Datapoints = append(resp.Datapoints, partResp.Datapoints...)
  225. } else {
  226. resp = partResp
  227. }
  228. metrics.M_Aws_CloudWatch_GetMetricStatistics.Inc()
  229. }
  230. queryRes, err := parseResponse(resp, query)
  231. if err != nil {
  232. return nil, err
  233. }
  234. return queryRes, nil
  235. }
  236. func (e *CloudWatchExecutor) executeGetMetricDataQuery(ctx context.Context, region string, queries map[string]*CloudWatchQuery, queryContext *tsdb.TsdbQuery) ([]*tsdb.QueryResult, error) {
  237. queryResponses := make([]*tsdb.QueryResult, 0)
  238. // validate query
  239. for _, query := range queries {
  240. if !(len(query.Statistics) == 1 && len(query.ExtendedStatistics) == 0) &&
  241. !(len(query.Statistics) == 0 && len(query.ExtendedStatistics) == 1) {
  242. return queryResponses, errors.New("Statistics count should be 1")
  243. }
  244. }
  245. client, err := e.getClient(region)
  246. if err != nil {
  247. return queryResponses, err
  248. }
  249. startTime, err := queryContext.TimeRange.ParseFrom()
  250. if err != nil {
  251. return queryResponses, err
  252. }
  253. endTime, err := queryContext.TimeRange.ParseTo()
  254. if err != nil {
  255. return queryResponses, err
  256. }
  257. params := &cloudwatch.GetMetricDataInput{
  258. StartTime: aws.Time(startTime),
  259. EndTime: aws.Time(endTime),
  260. ScanBy: aws.String("TimestampAscending"),
  261. }
  262. for _, query := range queries {
  263. // 1 minutes resolution metrics is stored for 15 days, 15 * 24 * 60 = 21600
  264. if query.HighResolution && (((endTime.Unix() - startTime.Unix()) / int64(query.Period)) > 21600) {
  265. return queryResponses, errors.New("too long query period")
  266. }
  267. mdq := &cloudwatch.MetricDataQuery{
  268. Id: aws.String(query.Id),
  269. ReturnData: aws.Bool(query.ReturnData),
  270. }
  271. if query.Expression != "" {
  272. mdq.Expression = aws.String(query.Expression)
  273. } else {
  274. mdq.MetricStat = &cloudwatch.MetricStat{
  275. Metric: &cloudwatch.Metric{
  276. Namespace: aws.String(query.Namespace),
  277. MetricName: aws.String(query.MetricName),
  278. },
  279. Period: aws.Int64(int64(query.Period)),
  280. }
  281. for _, d := range query.Dimensions {
  282. mdq.MetricStat.Metric.Dimensions = append(mdq.MetricStat.Metric.Dimensions,
  283. &cloudwatch.Dimension{
  284. Name: d.Name,
  285. Value: d.Value,
  286. })
  287. }
  288. if len(query.Statistics) == 1 {
  289. mdq.MetricStat.Stat = query.Statistics[0]
  290. } else {
  291. mdq.MetricStat.Stat = query.ExtendedStatistics[0]
  292. }
  293. }
  294. params.MetricDataQueries = append(params.MetricDataQueries, mdq)
  295. }
  296. nextToken := ""
  297. mdr := make(map[string]*cloudwatch.MetricDataResult)
  298. for {
  299. if nextToken != "" {
  300. params.NextToken = aws.String(nextToken)
  301. }
  302. resp, err := client.GetMetricDataWithContext(ctx, params)
  303. if err != nil {
  304. return queryResponses, err
  305. }
  306. metrics.M_Aws_CloudWatch_GetMetricData.Add(float64(len(params.MetricDataQueries)))
  307. for _, r := range resp.MetricDataResults {
  308. if _, ok := mdr[*r.Id]; !ok {
  309. mdr[*r.Id] = r
  310. } else {
  311. mdr[*r.Id].Timestamps = append(mdr[*r.Id].Timestamps, r.Timestamps...)
  312. mdr[*r.Id].Values = append(mdr[*r.Id].Values, r.Values...)
  313. }
  314. }
  315. if resp.NextToken == nil || *resp.NextToken == "" {
  316. break
  317. }
  318. nextToken = *resp.NextToken
  319. }
  320. for i, r := range mdr {
  321. if *r.StatusCode != "Complete" {
  322. return queryResponses, fmt.Errorf("Part of query is failed: %s", *r.StatusCode)
  323. }
  324. queryRes := tsdb.NewQueryResult()
  325. queryRes.RefId = queries[i].RefId
  326. query := queries[*r.Id]
  327. series := tsdb.TimeSeries{
  328. Tags: map[string]string{},
  329. Points: make([]tsdb.TimePoint, 0),
  330. }
  331. for _, d := range query.Dimensions {
  332. series.Tags[*d.Name] = *d.Value
  333. }
  334. s := ""
  335. if len(query.Statistics) == 1 {
  336. s = *query.Statistics[0]
  337. } else {
  338. s = *query.ExtendedStatistics[0]
  339. }
  340. series.Name = formatAlias(query, s, series.Tags)
  341. for j, t := range r.Timestamps {
  342. expectedTimestamp := r.Timestamps[j].Add(time.Duration(query.Period) * time.Second)
  343. if j > 0 && expectedTimestamp.Before(*t) {
  344. series.Points = append(series.Points, tsdb.NewTimePoint(null.FloatFromPtr(nil), float64(expectedTimestamp.Unix()*1000)))
  345. }
  346. series.Points = append(series.Points, tsdb.NewTimePoint(null.FloatFrom(*r.Values[j]), float64((*t).Unix())*1000))
  347. }
  348. queryRes.Series = append(queryRes.Series, &series)
  349. queryRes.Meta = simplejson.New()
  350. queryResponses = append(queryResponses, queryRes)
  351. }
  352. return queryResponses, nil
  353. }
  354. func parseDimensions(model *simplejson.Json) ([]*cloudwatch.Dimension, error) {
  355. var result []*cloudwatch.Dimension
  356. for k, v := range model.Get("dimensions").MustMap() {
  357. kk := k
  358. if vv, ok := v.(string); ok {
  359. result = append(result, &cloudwatch.Dimension{
  360. Name: &kk,
  361. Value: &vv,
  362. })
  363. } else {
  364. return nil, errors.New("failed to parse")
  365. }
  366. }
  367. sort.Slice(result, func(i, j int) bool {
  368. return *result[i].Name < *result[j].Name
  369. })
  370. return result, nil
  371. }
  372. func parseStatistics(model *simplejson.Json) ([]string, []string, error) {
  373. var statistics []string
  374. var extendedStatistics []string
  375. for _, s := range model.Get("statistics").MustArray() {
  376. if ss, ok := s.(string); ok {
  377. if _, isStandard := standardStatistics[ss]; isStandard {
  378. statistics = append(statistics, ss)
  379. } else {
  380. extendedStatistics = append(extendedStatistics, ss)
  381. }
  382. } else {
  383. return nil, nil, errors.New("failed to parse")
  384. }
  385. }
  386. return statistics, extendedStatistics, nil
  387. }
  388. func parseQuery(model *simplejson.Json) (*CloudWatchQuery, error) {
  389. region, err := model.Get("region").String()
  390. if err != nil {
  391. return nil, err
  392. }
  393. namespace, err := model.Get("namespace").String()
  394. if err != nil {
  395. return nil, err
  396. }
  397. metricName, err := model.Get("metricName").String()
  398. if err != nil {
  399. return nil, err
  400. }
  401. id := model.Get("id").MustString("")
  402. expression := model.Get("expression").MustString("")
  403. dimensions, err := parseDimensions(model)
  404. if err != nil {
  405. return nil, err
  406. }
  407. statistics, extendedStatistics, err := parseStatistics(model)
  408. if err != nil {
  409. return nil, err
  410. }
  411. p := model.Get("period").MustString("")
  412. if p == "" {
  413. if namespace == "AWS/EC2" {
  414. p = "300"
  415. } else {
  416. p = "60"
  417. }
  418. }
  419. var period int
  420. if regexp.MustCompile(`^\d+$`).Match([]byte(p)) {
  421. period, err = strconv.Atoi(p)
  422. if err != nil {
  423. return nil, err
  424. }
  425. } else {
  426. d, err := time.ParseDuration(p)
  427. if err != nil {
  428. return nil, err
  429. }
  430. period = int(d.Seconds())
  431. }
  432. alias := model.Get("alias").MustString()
  433. returnData := model.Get("returnData").MustBool(false)
  434. highResolution := model.Get("highResolution").MustBool(false)
  435. return &CloudWatchQuery{
  436. Region: region,
  437. Namespace: namespace,
  438. MetricName: metricName,
  439. Dimensions: dimensions,
  440. Statistics: aws.StringSlice(statistics),
  441. ExtendedStatistics: aws.StringSlice(extendedStatistics),
  442. Period: period,
  443. Alias: alias,
  444. Id: id,
  445. Expression: expression,
  446. ReturnData: returnData,
  447. HighResolution: highResolution,
  448. }, nil
  449. }
  450. func formatAlias(query *CloudWatchQuery, stat string, dimensions map[string]string) string {
  451. if len(query.Id) > 0 && len(query.Expression) > 0 {
  452. if len(query.Alias) > 0 {
  453. return query.Alias
  454. } else {
  455. return query.Id
  456. }
  457. }
  458. data := map[string]string{}
  459. data["region"] = query.Region
  460. data["namespace"] = query.Namespace
  461. data["metric"] = query.MetricName
  462. data["stat"] = stat
  463. data["period"] = strconv.Itoa(query.Period)
  464. for k, v := range dimensions {
  465. data[k] = v
  466. }
  467. result := aliasFormat.ReplaceAllFunc([]byte(query.Alias), func(in []byte) []byte {
  468. labelName := strings.Replace(string(in), "{{", "", 1)
  469. labelName = strings.Replace(labelName, "}}", "", 1)
  470. labelName = strings.TrimSpace(labelName)
  471. if val, exists := data[labelName]; exists {
  472. return []byte(val)
  473. }
  474. return in
  475. })
  476. return string(result)
  477. }
  478. func parseResponse(resp *cloudwatch.GetMetricStatisticsOutput, query *CloudWatchQuery) (*tsdb.QueryResult, error) {
  479. queryRes := tsdb.NewQueryResult()
  480. queryRes.RefId = query.RefId
  481. var value float64
  482. for _, s := range append(query.Statistics, query.ExtendedStatistics...) {
  483. series := tsdb.TimeSeries{
  484. Tags: map[string]string{},
  485. Points: make([]tsdb.TimePoint, 0),
  486. }
  487. for _, d := range query.Dimensions {
  488. series.Tags[*d.Name] = *d.Value
  489. }
  490. series.Name = formatAlias(query, *s, series.Tags)
  491. lastTimestamp := make(map[string]time.Time)
  492. sort.Slice(resp.Datapoints, func(i, j int) bool {
  493. return (*resp.Datapoints[i].Timestamp).Before(*resp.Datapoints[j].Timestamp)
  494. })
  495. for _, v := range resp.Datapoints {
  496. switch *s {
  497. case "Average":
  498. value = *v.Average
  499. case "Maximum":
  500. value = *v.Maximum
  501. case "Minimum":
  502. value = *v.Minimum
  503. case "Sum":
  504. value = *v.Sum
  505. case "SampleCount":
  506. value = *v.SampleCount
  507. default:
  508. if strings.Index(*s, "p") == 0 && v.ExtendedStatistics[*s] != nil {
  509. value = *v.ExtendedStatistics[*s]
  510. }
  511. }
  512. // terminate gap of data points
  513. timestamp := *v.Timestamp
  514. if _, ok := lastTimestamp[*s]; ok {
  515. nextTimestampFromLast := lastTimestamp[*s].Add(time.Duration(query.Period) * time.Second)
  516. for timestamp.After(nextTimestampFromLast) {
  517. series.Points = append(series.Points, tsdb.NewTimePoint(null.FloatFromPtr(nil), float64(nextTimestampFromLast.Unix()*1000)))
  518. nextTimestampFromLast = nextTimestampFromLast.Add(time.Duration(query.Period) * time.Second)
  519. }
  520. }
  521. lastTimestamp[*s] = timestamp
  522. series.Points = append(series.Points, tsdb.NewTimePoint(null.FloatFrom(value), float64(timestamp.Unix()*1000)))
  523. }
  524. queryRes.Series = append(queryRes.Series, &series)
  525. queryRes.Meta = simplejson.New()
  526. if len(resp.Datapoints) > 0 && resp.Datapoints[0].Unit != nil {
  527. if unit, ok := cloudwatchUnitMappings[*resp.Datapoints[0].Unit]; ok {
  528. queryRes.Meta.Set("unit", unit)
  529. }
  530. }
  531. }
  532. return queryRes, nil
  533. }