metrics.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. package cloudwatch
  2. import (
  3. "encoding/json"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/aws/aws-sdk-go/aws"
  9. "github.com/aws/aws-sdk-go/aws/awsutil"
  10. "github.com/aws/aws-sdk-go/aws/session"
  11. "github.com/aws/aws-sdk-go/service/cloudwatch"
  12. "github.com/grafana/grafana/pkg/middleware"
  13. "github.com/grafana/grafana/pkg/util"
  14. )
  // CustomMetricsCache is one cached list of names together with the instant
  // until which that list may be served without being rebuilt.
  type CustomMetricsCache struct {
  	// Expire is compared against time.Now(); the cache is reused while Expire
  	// is still in the future.
  	Expire time.Time
  	// Cache holds the cached names.
  	Cache []string
  }

  var (
  	// metricsMap maps a built-in namespace to its metric names. It is
  	// populated in init.
  	metricsMap map[string][]string

  	// dimensionsMap maps a built-in namespace to its dimension names. It is
  	// populated in init.
  	dimensionsMap map[string][]string

  	// customMetricsMetricsMap caches metric names of custom namespaces,
  	// indexed as [database][region][namespace].
  	customMetricsMetricsMap map[string]map[string]map[string]*CustomMetricsCache

  	// customMetricsDimensionsMap caches dimension names of custom namespaces,
  	// indexed as [database][region][namespace].
  	customMetricsDimensionsMap map[string]map[string]map[string]*CustomMetricsCache
  )
  23. func init() {
  24. metricsMap = map[string][]string{
  25. "AWS/AutoScaling": {"GroupMinSize", "GroupMaxSize", "GroupDesiredCapacity", "GroupInServiceInstances", "GroupPendingInstances", "GroupStandbyInstances", "GroupTerminatingInstances", "GroupTotalInstances"},
  26. "AWS/Billing": {"EstimatedCharges"},
  27. "AWS/CloudFront": {"Requests", "BytesDownloaded", "BytesUploaded", "TotalErrorRate", "4xxErrorRate", "5xxErrorRate"},
  28. "AWS/CloudSearch": {"SuccessfulRequests", "SearchableDocuments", "IndexUtilization", "Partitions"},
  29. "AWS/DynamoDB": {"ConditionalCheckFailedRequests", "ConsumedReadCapacityUnits", "ConsumedWriteCapacityUnits", "OnlineIndexConsumedWriteCapacity", "OnlineIndexPercentageProgress", "OnlineIndexThrottleEvents", "ProvisionedReadCapacityUnits", "ProvisionedWriteCapacityUnits", "ReadThrottleEvents", "ReturnedItemCount", "SuccessfulRequestLatency", "SystemErrors", "ThrottledRequests", "UserErrors", "WriteThrottleEvents"},
  30. "AWS/ECS": {"CPUUtilization", "MemoryUtilization"},
  31. "AWS/ElastiCache": {
  32. "CPUUtilization", "FreeableMemory", "NetworkBytesIn", "NetworkBytesOut", "SwapUsage",
  33. "BytesUsedForCacheItems", "BytesReadIntoMemcached", "BytesWrittenOutFromMemcached", "CasBadval", "CasHits", "CasMisses", "CmdFlush", "CmdGet", "CmdSet", "CurrConnections", "CurrItems", "DecrHits", "DecrMisses", "DeleteHits", "DeleteMisses", "Evictions", "GetHits", "GetMisses", "IncrHits", "IncrMisses", "Reclaimed",
  34. "BytesUsedForHash", "CmdConfigGet", "CmdConfigSet", "CmdTouch", "CurrConfig", "EvictedUnfetched", "ExpiredUnfetched", "SlabsMoved", "TouchHits", "TouchMisses",
  35. "NewConnections", "NewItems", "UnusedMemory",
  36. "BytesUsedForCache", "CacheHits", "CacheMisses", "CurrConnections", "Evictions", "HyperLogLogBasedCmds", "NewConnections", "Reclaimed", "ReplicationBytes", "ReplicationLag", "SaveInProgress",
  37. "CurrItems", "GetTypeCmds", "HashBasedCmds", "KeyBasedCmds", "ListBasedCmds", "SetBasedCmds", "SetTypeCmds", "SortedSetBasedCmds", "StringBasedCmds",
  38. },
  39. "AWS/EBS": {"VolumeReadBytes", "VolumeWriteBytes", "VolumeReadOps", "VolumeWriteOps", "VolumeTotalReadTime", "VolumeTotalWriteTime", "VolumeIdleTime", "VolumeQueueLength", "VolumeThroughputPercentage", "VolumeConsumedReadWriteOps"},
  40. "AWS/EC2": {"CPUCreditUsage", "CPUCreditBalance", "CPUUtilization", "DiskReadOps", "DiskWriteOps", "DiskReadBytes", "DiskWriteBytes", "NetworkIn", "NetworkOut", "StatusCheckFailed", "StatusCheckFailed_Instance", "StatusCheckFailed_System"},
  41. "AWS/ELB": {"HealthyHostCount", "UnHealthyHostCount", "RequestCount", "Latency", "HTTPCode_ELB_4XX", "HTTPCode_ELB_5XX", "HTTPCode_Backend_2XX", "HTTPCode_Backend_3XX", "HTTPCode_Backend_4XX", "HTTPCode_Backend_5XX", "BackendConnectionErrors", "SurgeQueueLength", "SpilloverCount"},
  42. "AWS/ElasticMapReduce": {"IsIdle", "JobsRunning", "JobsFailed",
  43. "MapTasksRunning", "MapTasksRemaining", "MapSlotsOpen", "RemainingMapTasksPerSlot", "ReduceTasksRunning", "ReduceTasksRemaining", "ReduceSlotsOpen",
  44. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "TaskNodesRunning", "TaskNodesPending", "LiveTaskTrackers",
  45. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "TotalLoad",
  46. "BackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup",
  47. "IsIdle", "ContainerAllocated", "ContainerReserved", "ContainerPending", "AppsCompleted", "AppsFailed", "AppsKilled", "AppsPending", "AppsRunning", "AppsSubmitted",
  48. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "MRTotalNodes", "MRActiveNodes", "MRLostNodes", "MRUnhealthyNodes", "MRDecommissionedNodes", "MRRebootedNodes",
  49. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "CorruptBlocks", "TotalLoad", "MemoryTotalMB", "MemoryReservedMB", "MemoryAvailableMB", "MemoryAllocatedMB", "PendingDeletionBlocks", "UnderReplicatedBlocks", "DfsPendingReplicationBlocks", "CapacityRemainingGB",
  50. "HbaseBackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup"},
  51. "AWS/ES": {"ClusterStatus.green", "ClusterStatus.yellow", "ClusterStatus.red", "Nodes", "SearchableDocuments", "DeletedDocuments", "CPUUtilization", "FreeStorageSpace", "JVMMemoryPressure", "AutomatedSnapshotFailure", "MasterCPUUtilization", "MasterFreeStorageSpace", "MasterJVMMemoryPressure", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "DiskQueueLength", "ReadIOPS", "WriteIOPS"},
  52. "AWS/Kinesis": {"PutRecord.Bytes", "PutRecord.Latency", "PutRecord.Success", "PutRecords.Bytes", "PutRecords.Latency", "PutRecords.Records", "PutRecords.Success", "IncomingBytes", "IncomingRecords", "GetRecords.Bytes", "GetRecords.IteratorAgeMilliseconds", "GetRecords.Latency", "GetRecords.Success"},
  53. "AWS/Lambda": {"Invocations", "Errors", "Duration", "Throttles"},
  54. "AWS/ML": {"PredictCount", "PredictFailureCount"},
  55. "AWS/OpsWorks": {"cpu_idle", "cpu_nice", "cpu_system", "cpu_user", "cpu_waitio", "load_1", "load_5", "load_15", "memory_buffers", "memory_cached", "memory_free", "memory_swap", "memory_total", "memory_used", "procs"},
  56. "AWS/Redshift": {"CPUUtilization", "DatabaseConnections", "HealthStatus", "MaintenanceMode", "NetworkReceiveThroughput", "NetworkTransmitThroughput", "PercentageDiskSpaceUsed", "ReadIOPS", "ReadLatency", "ReadThroughput", "WriteIOPS", "WriteLatency", "WriteThroughput"},
  57. "AWS/RDS": {"BinLogDiskUsage", "CPUUtilization", "CPUCreditUsage", "CPUCreditBalance", "DatabaseConnections", "DiskQueueDepth", "FreeableMemory", "FreeStorageSpace", "ReplicaLag", "SwapUsage", "ReadIOPS", "WriteIOPS", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "NetworkReceiveThroughput", "NetworkTransmitThroughput"},
  58. "AWS/Route53": {"HealthCheckStatus", "HealthCheckPercentageHealthy"},
  59. "AWS/SNS": {"NumberOfMessagesPublished", "PublishSize", "NumberOfNotificationsDelivered", "NumberOfNotificationsFailed"},
  60. "AWS/SQS": {"NumberOfMessagesSent", "SentMessageSize", "NumberOfMessagesReceived", "NumberOfEmptyReceives", "NumberOfMessagesDeleted", "ApproximateNumberOfMessagesDelayed", "ApproximateNumberOfMessagesVisible", "ApproximateNumberOfMessagesNotVisible"},
  61. "AWS/S3": {"BucketSizeBytes", "NumberOfObjects"},
  62. "AWS/SWF": {"DecisionTaskScheduleToStartTime", "DecisionTaskStartToCloseTime", "DecisionTasksCompleted", "StartedDecisionTasksTimedOutOnClose", "WorkflowStartToCloseTime", "WorkflowsCanceled", "WorkflowsCompleted", "WorkflowsContinuedAsNew", "WorkflowsFailed", "WorkflowsTerminated", "WorkflowsTimedOut",
  63. "ActivityTaskScheduleToCloseTime", "ActivityTaskScheduleToStartTime", "ActivityTaskStartToCloseTime", "ActivityTasksCanceled", "ActivityTasksCompleted", "ActivityTasksFailed", "ScheduledActivityTasksTimedOutOnClose", "ScheduledActivityTasksTimedOutOnStart", "StartedActivityTasksTimedOutOnClose", "StartedActivityTasksTimedOutOnHeartbeat"},
  64. "AWS/StorageGateway": {"CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "CloudBytesDownloaded", "CloudDownloadLatency", "CloudBytesUploaded", "UploadBufferFree", "UploadBufferPercentUsed", "UploadBufferUsed", "QueuedWrites", "ReadBytes", "ReadTime", "TotalCacheSize", "WriteBytes", "WriteTime", "TimeSinceLastRecoveryPoint", "WorkingStorageFree", "WorkingStoragePercentUsed", "WorkingStorageUsed",
  65. "CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "ReadBytes", "ReadTime", "WriteBytes", "WriteTime", "QueuedWrites"},
  66. "AWS/WAF": {"AllowedRequests", "BlockedRequests", "CountedRequests"},
  67. "AWS/WorkSpaces": {"Available", "Unhealthy", "ConnectionAttempt", "ConnectionSuccess", "ConnectionFailure", "SessionLaunchTime", "InSessionLatency", "SessionDisconnect"},
  68. }
  69. dimensionsMap = map[string][]string{
  70. "AWS/AutoScaling": {"AutoScalingGroupName"},
  71. "AWS/Billing": {"ServiceName", "LinkedAccount", "Currency"},
  72. "AWS/CloudFront": {"DistributionId", "Region"},
  73. "AWS/CloudSearch": {},
  74. "AWS/DynamoDB": {"TableName", "GlobalSecondaryIndexName", "Operation"},
  75. "AWS/ECS": {"ClusterName", "ServiceName"},
  76. "AWS/ElastiCache": {"CacheClusterId", "CacheNodeId"},
  77. "AWS/EBS": {"VolumeId"},
  78. "AWS/EC2": {"AutoScalingGroupName", "ImageId", "InstanceId", "InstanceType"},
  79. "AWS/ELB": {"LoadBalancerName", "AvailabilityZone"},
  80. "AWS/ElasticMapReduce": {"ClusterId", "JobFlowId", "JobId"},
  81. "AWS/ES": {},
  82. "AWS/Kinesis": {"StreamName"},
  83. "AWS/Lambda": {"FunctionName"},
  84. "AWS/ML": {"MLModelId", "RequestMode"},
  85. "AWS/OpsWorks": {"StackId", "LayerId", "InstanceId"},
  86. "AWS/Redshift": {"NodeID", "ClusterIdentifier"},
  87. "AWS/RDS": {"DBInstanceIdentifier", "DatabaseClass", "EngineName"},
  88. "AWS/Route53": {"HealthCheckId"},
  89. "AWS/SNS": {"Application", "Platform", "TopicName"},
  90. "AWS/SQS": {"QueueName"},
  91. "AWS/S3": {"BucketName", "StorageType"},
  92. "AWS/SWF": {"Domain", "WorkflowTypeName", "WorkflowTypeVersion", "ActivityTypeName", "ActivityTypeVersion"},
  93. "AWS/StorageGateway": {"GatewayId", "GatewayName", "VolumeId"},
  94. "AWS/WAF": {"Rule", "WebACL"},
  95. "AWS/WorkSpaces": {"DirectoryId", "WorkspaceId"},
  96. }
  97. customMetricsMetricsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  98. customMetricsDimensionsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  99. }
  100. // Whenever this list is updated, frontend list should also be updated.
  101. // Please update the region list in public/app/plugins/datasource/cloudwatch/partials/config.html
  102. func handleGetRegions(req *cwRequest, c *middleware.Context) {
  103. regions := []string{
  104. "ap-northeast-1", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "cn-north-1",
  105. "eu-central-1", "eu-west-1", "sa-east-1", "us-east-1", "us-west-1", "us-west-2",
  106. }
  107. result := []interface{}{}
  108. for _, region := range regions {
  109. result = append(result, util.DynMap{"text": region, "value": region})
  110. }
  111. c.JSON(200, result)
  112. }
  113. func handleGetNamespaces(req *cwRequest, c *middleware.Context) {
  114. keys := []string{}
  115. for key := range metricsMap {
  116. keys = append(keys, key)
  117. }
  118. if customMetricsNamespaces, ok := req.DataSource.JsonData["customMetricsNamespaces"].(string); ok {
  119. for _, key := range strings.Split(customMetricsNamespaces, ",") {
  120. keys = append(keys, key)
  121. }
  122. }
  123. sort.Sort(sort.StringSlice(keys))
  124. result := []interface{}{}
  125. for _, key := range keys {
  126. result = append(result, util.DynMap{"text": key, "value": key})
  127. }
  128. c.JSON(200, result)
  129. }
  130. func handleGetMetrics(req *cwRequest, c *middleware.Context) {
  131. reqParam := &struct {
  132. Parameters struct {
  133. Namespace string `json:"namespace"`
  134. } `json:"parameters"`
  135. }{}
  136. json.Unmarshal(req.Body, reqParam)
  137. var namespaceMetrics []string
  138. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  139. var exists bool
  140. if namespaceMetrics, exists = metricsMap[reqParam.Parameters.Namespace]; !exists {
  141. c.JsonApiErr(404, "Unable to find namespace "+reqParam.Parameters.Namespace, nil)
  142. return
  143. }
  144. } else {
  145. var err error
  146. if namespaceMetrics, err = getMetricsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, getAllMetrics); err != nil {
  147. c.JsonApiErr(500, "Unable to call AWS API", err)
  148. return
  149. }
  150. }
  151. sort.Sort(sort.StringSlice(namespaceMetrics))
  152. result := []interface{}{}
  153. for _, name := range namespaceMetrics {
  154. result = append(result, util.DynMap{"text": name, "value": name})
  155. }
  156. c.JSON(200, result)
  157. }
  158. func handleGetDimensions(req *cwRequest, c *middleware.Context) {
  159. reqParam := &struct {
  160. Parameters struct {
  161. Namespace string `json:"namespace"`
  162. } `json:"parameters"`
  163. }{}
  164. json.Unmarshal(req.Body, reqParam)
  165. var dimensionValues []string
  166. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  167. var exists bool
  168. if dimensionValues, exists = dimensionsMap[reqParam.Parameters.Namespace]; !exists {
  169. c.JsonApiErr(404, "Unable to find dimension "+reqParam.Parameters.Namespace, nil)
  170. return
  171. }
  172. } else {
  173. var err error
  174. if dimensionValues, err = getDimensionsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, getAllMetrics); err != nil {
  175. c.JsonApiErr(500, "Unable to call AWS API", err)
  176. return
  177. }
  178. }
  179. sort.Sort(sort.StringSlice(dimensionValues))
  180. result := []interface{}{}
  181. for _, name := range dimensionValues {
  182. result = append(result, util.DynMap{"text": name, "value": name})
  183. }
  184. c.JSON(200, result)
  185. }
  186. func getAllMetrics(region string, namespace string, database string) (cloudwatch.ListMetricsOutput, error) {
  187. cfg := &aws.Config{
  188. Region: aws.String(region),
  189. Credentials: getCredentials(database),
  190. }
  191. svc := cloudwatch.New(session.New(cfg), cfg)
  192. params := &cloudwatch.ListMetricsInput{
  193. Namespace: aws.String(namespace),
  194. }
  195. var resp cloudwatch.ListMetricsOutput
  196. err := svc.ListMetricsPages(params,
  197. func(page *cloudwatch.ListMetricsOutput, lastPage bool) bool {
  198. metrics, _ := awsutil.ValuesAtPath(page, "Metrics")
  199. for _, metric := range metrics {
  200. resp.Metrics = append(resp.Metrics, metric.(*cloudwatch.Metric))
  201. }
  202. return !lastPage
  203. })
  204. if err != nil {
  205. return resp, err
  206. }
  207. return resp, nil
  208. }
  209. var metricsCacheLock sync.Mutex
  210. func getMetricsForCustomMetrics(region string, namespace string, database string, getAllMetrics func(string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  211. result, err := getAllMetrics(region, namespace, database)
  212. if err != nil {
  213. return []string{}, err
  214. }
  215. metricsCacheLock.Lock()
  216. defer metricsCacheLock.Unlock()
  217. if _, ok := customMetricsMetricsMap[database]; !ok {
  218. customMetricsMetricsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  219. }
  220. if _, ok := customMetricsMetricsMap[database][region]; !ok {
  221. customMetricsMetricsMap[database][region] = make(map[string]*CustomMetricsCache)
  222. }
  223. if _, ok := customMetricsMetricsMap[database][region][namespace]; !ok {
  224. customMetricsMetricsMap[database][region][namespace] = &CustomMetricsCache{}
  225. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  226. }
  227. if customMetricsMetricsMap[database][region][namespace].Expire.After(time.Now()) {
  228. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  229. }
  230. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  231. customMetricsMetricsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  232. for _, metric := range result.Metrics {
  233. if isDuplicate(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName) {
  234. continue
  235. }
  236. customMetricsMetricsMap[database][region][namespace].Cache = append(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName)
  237. }
  238. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  239. }
  240. var dimensionsCacheLock sync.Mutex
  241. func getDimensionsForCustomMetrics(region string, namespace string, database string, getAllMetrics func(string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  242. result, err := getAllMetrics(region, namespace, database)
  243. if err != nil {
  244. return []string{}, err
  245. }
  246. dimensionsCacheLock.Lock()
  247. defer dimensionsCacheLock.Unlock()
  248. if _, ok := customMetricsDimensionsMap[database]; !ok {
  249. customMetricsDimensionsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  250. }
  251. if _, ok := customMetricsDimensionsMap[database][region]; !ok {
  252. customMetricsDimensionsMap[database][region] = make(map[string]*CustomMetricsCache)
  253. }
  254. if _, ok := customMetricsDimensionsMap[database][region][namespace]; !ok {
  255. customMetricsDimensionsMap[database][region][namespace] = &CustomMetricsCache{}
  256. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  257. }
  258. if customMetricsDimensionsMap[database][region][namespace].Expire.After(time.Now()) {
  259. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  260. }
  261. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  262. customMetricsDimensionsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  263. for _, metric := range result.Metrics {
  264. for _, dimension := range metric.Dimensions {
  265. if isDuplicate(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name) {
  266. continue
  267. }
  268. customMetricsDimensionsMap[database][region][namespace].Cache = append(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name)
  269. }
  270. }
  271. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  272. }
  // isDuplicate reports whether target is already present in nameList.
  // An empty or nil nameList never contains target.
  func isDuplicate(nameList []string, target string) bool {
  	for i := 0; i < len(nameList); i++ {
  		if nameList[i] == target {
  			return true
  		}
  	}
  	return false
  }
  // isCustomMetrics reports whether namespace is a custom one, i.e. whether it
  // does not begin with the "AWS/" prefix. The empty string counts as custom.
  func isCustomMetrics(namespace string) bool {
  	builtIn := strings.HasPrefix(namespace, "AWS/")
  	return !builtIn
  }