metrics.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. package cloudwatch
  2. import (
  3. "encoding/json"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/aws/aws-sdk-go/aws"
  9. "github.com/aws/aws-sdk-go/aws/awsutil"
  10. "github.com/aws/aws-sdk-go/aws/session"
  11. "github.com/aws/aws-sdk-go/service/cloudwatch"
  12. "github.com/grafana/grafana/pkg/middleware"
  13. "github.com/grafana/grafana/pkg/util"
  14. )
  15. var metricsMap map[string][]string
  16. var dimensionsMap map[string][]string
  17. type CustomMetricsCache struct {
  18. Expire time.Time
  19. Cache []string
  20. }
  21. var customMetricsMetricsMap map[string]map[string]map[string]*CustomMetricsCache
  22. var customMetricsDimensionsMap map[string]map[string]map[string]*CustomMetricsCache
  23. func init() {
  24. metricsMap = map[string][]string{
  25. "AWS/AutoScaling": {"GroupMinSize", "GroupMaxSize", "GroupDesiredCapacity", "GroupInServiceInstances", "GroupPendingInstances", "GroupStandbyInstances", "GroupTerminatingInstances", "GroupTotalInstances"},
  26. "AWS/Billing": {"EstimatedCharges"},
  27. "AWS/CloudFront": {"Requests", "BytesDownloaded", "BytesUploaded", "TotalErrorRate", "4xxErrorRate", "5xxErrorRate"},
  28. "AWS/CloudSearch": {"SuccessfulRequests", "SearchableDocuments", "IndexUtilization", "Partitions"},
  29. "AWS/DynamoDB": {"ConditionalCheckFailedRequests", "ConsumedReadCapacityUnits", "ConsumedWriteCapacityUnits", "OnlineIndexConsumedWriteCapacity", "OnlineIndexPercentageProgress", "OnlineIndexThrottleEvents", "ProvisionedReadCapacityUnits", "ProvisionedWriteCapacityUnits", "ReadThrottleEvents", "ReturnedItemCount", "SuccessfulRequestLatency", "SystemErrors", "ThrottledRequests", "UserErrors", "WriteThrottleEvents"},
  30. "AWS/ECS": {"CPUUtilization", "MemoryUtilization"},
  31. "AWS/ElastiCache": {
  32. "CPUUtilization", "FreeableMemory", "NetworkBytesIn", "NetworkBytesOut", "SwapUsage",
  33. "BytesUsedForCacheItems", "BytesReadIntoMemcached", "BytesWrittenOutFromMemcached", "CasBadval", "CasHits", "CasMisses", "CmdFlush", "CmdGet", "CmdSet", "CurrConnections", "CurrItems", "DecrHits", "DecrMisses", "DeleteHits", "DeleteMisses", "Evictions", "GetHits", "GetMisses", "IncrHits", "IncrMisses", "Reclaimed",
  34. "BytesUsedForHash", "CmdConfigGet", "CmdConfigSet", "CmdTouch", "CurrConfig", "EvictedUnfetched", "ExpiredUnfetched", "SlabsMoved", "TouchHits", "TouchMisses",
  35. "NewConnections", "NewItems", "UnusedMemory",
  36. "BytesUsedForCache", "CacheHits", "CacheMisses", "CurrConnections", "Evictions", "HyperLogLogBasedCmds", "NewConnections", "Reclaimed", "ReplicationBytes", "ReplicationLag", "SaveInProgress",
  37. "CurrItems", "GetTypeCmds", "HashBasedCmds", "KeyBasedCmds", "ListBasedCmds", "SetBasedCmds", "SetTypeCmds", "SortedSetBasedCmds", "StringBasedCmds",
  38. },
  39. "AWS/EBS": {"VolumeReadBytes", "VolumeWriteBytes", "VolumeReadOps", "VolumeWriteOps", "VolumeTotalReadTime", "VolumeTotalWriteTime", "VolumeIdleTime", "VolumeQueueLength", "VolumeThroughputPercentage", "VolumeConsumedReadWriteOps"},
  40. "AWS/EC2": {"CPUCreditUsage", "CPUCreditBalance", "CPUUtilization", "DiskReadOps", "DiskWriteOps", "DiskReadBytes", "DiskWriteBytes", "NetworkIn", "NetworkOut", "StatusCheckFailed", "StatusCheckFailed_Instance", "StatusCheckFailed_System"},
  41. "AWS/ELB": {"HealthyHostCount", "UnHealthyHostCount", "RequestCount", "Latency", "HTTPCode_ELB_4XX", "HTTPCode_ELB_5XX", "HTTPCode_Backend_2XX", "HTTPCode_Backend_3XX", "HTTPCode_Backend_4XX", "HTTPCode_Backend_5XX", "BackendConnectionErrors", "SurgeQueueLength", "SpilloverCount"},
  42. "AWS/ElasticMapReduce": {"IsIdle", "JobsRunning", "JobsFailed",
  43. "MapTasksRunning", "MapTasksRemaining", "MapSlotsOpen", "RemainingMapTasksPerSlot", "ReduceTasksRunning", "ReduceTasksRemaining", "ReduceSlotsOpen",
  44. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "TaskNodesRunning", "TaskNodesPending", "LiveTaskTrackers",
  45. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "TotalLoad",
  46. "BackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup",
  47. "IsIdle", "ContainerAllocated", "ContainerReserved", "ContainerPending", "AppsCompleted", "AppsFailed", "AppsKilled", "AppsPending", "AppsRunning", "AppsSubmitted",
  48. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "MRTotalNodes", "MRActiveNodes", "MRLostNodes", "MRUnhealthyNodes", "MRDecommissionedNodes", "MRRebootedNodes",
  49. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "CorruptBlocks", "TotalLoad", "MemoryTotalMB", "MemoryReservedMB", "MemoryAvailableMB", "MemoryAllocatedMB", "PendingDeletionBlocks", "UnderReplicatedBlocks", "DfsPendingReplicationBlocks", "CapacityRemainingGB",
  50. "HbaseBackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup"},
  51. "AWS/ES": {"ClusterStatus.green", "ClusterStatus.yellow", "ClusterStatus.red", "Nodes", "SearchableDocuments", "DeletedDocuments", "CPUUtilization", "FreeStorageSpace", "JVMMemoryPressure", "AutomatedSnapshotFailure", "MasterCPUUtilization", "MasterFreeStorageSpace", "MasterJVMMemoryPressure", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "DiskQueueLength", "ReadIOPS", "WriteIOPS"},
  52. "AWS/Events": {"Invocations", "FailedInvocations", "TriggeredRules", "MatchedEvents", "ThrottledRules"},
  53. "AWS/Kinesis": {"PutRecord.Bytes", "PutRecord.Latency", "PutRecord.Success", "PutRecords.Bytes", "PutRecords.Latency", "PutRecords.Records", "PutRecords.Success", "IncomingBytes", "IncomingRecords", "GetRecords.Bytes", "GetRecords.IteratorAgeMilliseconds", "GetRecords.Latency", "GetRecords.Success"},
  54. "AWS/Lambda": {"Invocations", "Errors", "Duration", "Throttles"},
  55. "AWS/Logs": {"IncomingBytes", "IncomingLogEvents", "ForwardedBytes", "ForwardedLogEvents", "DeliveryErrors", "DeliveryThrottling"},
  56. "AWS/ML": {"PredictCount", "PredictFailureCount"},
  57. "AWS/OpsWorks": {"cpu_idle", "cpu_nice", "cpu_system", "cpu_user", "cpu_waitio", "load_1", "load_5", "load_15", "memory_buffers", "memory_cached", "memory_free", "memory_swap", "memory_total", "memory_used", "procs"},
  58. "AWS/Redshift": {"CPUUtilization", "DatabaseConnections", "HealthStatus", "MaintenanceMode", "NetworkReceiveThroughput", "NetworkTransmitThroughput", "PercentageDiskSpaceUsed", "ReadIOPS", "ReadLatency", "ReadThroughput", "WriteIOPS", "WriteLatency", "WriteThroughput"},
  59. "AWS/RDS": {"BinLogDiskUsage", "CPUUtilization", "CPUCreditUsage", "CPUCreditBalance", "DatabaseConnections", "DiskQueueDepth", "FreeableMemory", "FreeStorageSpace", "ReplicaLag", "SwapUsage", "ReadIOPS", "WriteIOPS", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "NetworkReceiveThroughput", "NetworkTransmitThroughput"},
  60. "AWS/Route53": {"HealthCheckStatus", "HealthCheckPercentageHealthy", "ConnectionTime", "SSLHandshakeTime", "TimeToFirstByte"},
  61. "AWS/SNS": {"NumberOfMessagesPublished", "PublishSize", "NumberOfNotificationsDelivered", "NumberOfNotificationsFailed"},
  62. "AWS/SQS": {"NumberOfMessagesSent", "SentMessageSize", "NumberOfMessagesReceived", "NumberOfEmptyReceives", "NumberOfMessagesDeleted", "ApproximateNumberOfMessagesDelayed", "ApproximateNumberOfMessagesVisible", "ApproximateNumberOfMessagesNotVisible"},
  63. "AWS/S3": {"BucketSizeBytes", "NumberOfObjects"},
  64. "AWS/SWF": {"DecisionTaskScheduleToStartTime", "DecisionTaskStartToCloseTime", "DecisionTasksCompleted", "StartedDecisionTasksTimedOutOnClose", "WorkflowStartToCloseTime", "WorkflowsCanceled", "WorkflowsCompleted", "WorkflowsContinuedAsNew", "WorkflowsFailed", "WorkflowsTerminated", "WorkflowsTimedOut",
  65. "ActivityTaskScheduleToCloseTime", "ActivityTaskScheduleToStartTime", "ActivityTaskStartToCloseTime", "ActivityTasksCanceled", "ActivityTasksCompleted", "ActivityTasksFailed", "ScheduledActivityTasksTimedOutOnClose", "ScheduledActivityTasksTimedOutOnStart", "StartedActivityTasksTimedOutOnClose", "StartedActivityTasksTimedOutOnHeartbeat"},
  66. "AWS/StorageGateway": {"CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "CloudBytesDownloaded", "CloudDownloadLatency", "CloudBytesUploaded", "UploadBufferFree", "UploadBufferPercentUsed", "UploadBufferUsed", "QueuedWrites", "ReadBytes", "ReadTime", "TotalCacheSize", "WriteBytes", "WriteTime", "TimeSinceLastRecoveryPoint", "WorkingStorageFree", "WorkingStoragePercentUsed", "WorkingStorageUsed",
  67. "CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "ReadBytes", "ReadTime", "WriteBytes", "WriteTime", "QueuedWrites"},
  68. "AWS/WAF": {"AllowedRequests", "BlockedRequests", "CountedRequests"},
  69. "AWS/WorkSpaces": {"Available", "Unhealthy", "ConnectionAttempt", "ConnectionSuccess", "ConnectionFailure", "SessionLaunchTime", "InSessionLatency", "SessionDisconnect"},
  70. }
  71. dimensionsMap = map[string][]string{
  72. "AWS/AutoScaling": {"AutoScalingGroupName"},
  73. "AWS/Billing": {"ServiceName", "LinkedAccount", "Currency"},
  74. "AWS/CloudFront": {"DistributionId", "Region"},
  75. "AWS/CloudSearch": {},
  76. "AWS/DynamoDB": {"TableName", "GlobalSecondaryIndexName", "Operation"},
  77. "AWS/ECS": {"ClusterName", "ServiceName"},
  78. "AWS/ElastiCache": {"CacheClusterId", "CacheNodeId"},
  79. "AWS/EBS": {"VolumeId"},
  80. "AWS/EC2": {"AutoScalingGroupName", "ImageId", "InstanceId", "InstanceType"},
  81. "AWS/ELB": {"LoadBalancerName", "AvailabilityZone"},
  82. "AWS/ElasticMapReduce": {"ClusterId", "JobFlowId", "JobId"},
  83. "AWS/ES": {},
  84. "AWS/Events": {"RuleName"},
  85. "AWS/Kinesis": {"StreamName"},
  86. "AWS/Lambda": {"FunctionName"},
  87. "AWS/Logs": {"LogGroupName", "DestinationType", "FilterName"},
  88. "AWS/ML": {"MLModelId", "RequestMode"},
  89. "AWS/OpsWorks": {"StackId", "LayerId", "InstanceId"},
  90. "AWS/Redshift": {"NodeID", "ClusterIdentifier"},
  91. "AWS/RDS": {"DBInstanceIdentifier", "DatabaseClass", "EngineName"},
  92. "AWS/Route53": {"HealthCheckId"},
  93. "AWS/SNS": {"Application", "Platform", "TopicName"},
  94. "AWS/SQS": {"QueueName"},
  95. "AWS/S3": {"BucketName", "StorageType"},
  96. "AWS/SWF": {"Domain", "WorkflowTypeName", "WorkflowTypeVersion", "ActivityTypeName", "ActivityTypeVersion"},
  97. "AWS/StorageGateway": {"GatewayId", "GatewayName", "VolumeId"},
  98. "AWS/WAF": {"Rule", "WebACL"},
  99. "AWS/WorkSpaces": {"DirectoryId", "WorkspaceId"},
  100. }
  101. customMetricsMetricsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  102. customMetricsDimensionsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  103. }
  104. // Whenever this list is updated, frontend list should also be updated.
  105. // Please update the region list in public/app/plugins/datasource/cloudwatch/partials/config.html
  106. func handleGetRegions(req *cwRequest, c *middleware.Context) {
  107. regions := []string{
  108. "ap-northeast-1", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "cn-north-1",
  109. "eu-central-1", "eu-west-1", "sa-east-1", "us-east-1", "us-west-1", "us-west-2",
  110. }
  111. result := []interface{}{}
  112. for _, region := range regions {
  113. result = append(result, util.DynMap{"text": region, "value": region})
  114. }
  115. c.JSON(200, result)
  116. }
  117. func handleGetNamespaces(req *cwRequest, c *middleware.Context) {
  118. keys := []string{}
  119. for key := range metricsMap {
  120. keys = append(keys, key)
  121. }
  122. customNamespaces := req.DataSource.JsonData.Get("customMetricsNamespaces").MustString()
  123. if customNamespaces != "" {
  124. for _, key := range strings.Split(customNamespaces, ",") {
  125. keys = append(keys, key)
  126. }
  127. }
  128. sort.Sort(sort.StringSlice(keys))
  129. result := []interface{}{}
  130. for _, key := range keys {
  131. result = append(result, util.DynMap{"text": key, "value": key})
  132. }
  133. c.JSON(200, result)
  134. }
  135. func handleGetMetrics(req *cwRequest, c *middleware.Context) {
  136. reqParam := &struct {
  137. Parameters struct {
  138. Namespace string `json:"namespace"`
  139. } `json:"parameters"`
  140. }{}
  141. json.Unmarshal(req.Body, reqParam)
  142. var namespaceMetrics []string
  143. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  144. var exists bool
  145. if namespaceMetrics, exists = metricsMap[reqParam.Parameters.Namespace]; !exists {
  146. c.JsonApiErr(404, "Unable to find namespace "+reqParam.Parameters.Namespace, nil)
  147. return
  148. }
  149. } else {
  150. var err error
  151. if namespaceMetrics, err = getMetricsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, getAllMetrics); err != nil {
  152. c.JsonApiErr(500, "Unable to call AWS API", err)
  153. return
  154. }
  155. }
  156. sort.Sort(sort.StringSlice(namespaceMetrics))
  157. result := []interface{}{}
  158. for _, name := range namespaceMetrics {
  159. result = append(result, util.DynMap{"text": name, "value": name})
  160. }
  161. c.JSON(200, result)
  162. }
  163. func handleGetDimensions(req *cwRequest, c *middleware.Context) {
  164. reqParam := &struct {
  165. Parameters struct {
  166. Namespace string `json:"namespace"`
  167. } `json:"parameters"`
  168. }{}
  169. json.Unmarshal(req.Body, reqParam)
  170. var dimensionValues []string
  171. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  172. var exists bool
  173. if dimensionValues, exists = dimensionsMap[reqParam.Parameters.Namespace]; !exists {
  174. c.JsonApiErr(404, "Unable to find dimension "+reqParam.Parameters.Namespace, nil)
  175. return
  176. }
  177. } else {
  178. var err error
  179. if dimensionValues, err = getDimensionsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, getAllMetrics); err != nil {
  180. c.JsonApiErr(500, "Unable to call AWS API", err)
  181. return
  182. }
  183. }
  184. sort.Sort(sort.StringSlice(dimensionValues))
  185. result := []interface{}{}
  186. for _, name := range dimensionValues {
  187. result = append(result, util.DynMap{"text": name, "value": name})
  188. }
  189. c.JSON(200, result)
  190. }
  191. func getAllMetrics(region string, namespace string, database string) (cloudwatch.ListMetricsOutput, error) {
  192. cfg := &aws.Config{
  193. Region: aws.String(region),
  194. Credentials: getCredentials(database),
  195. }
  196. svc := cloudwatch.New(session.New(cfg), cfg)
  197. params := &cloudwatch.ListMetricsInput{
  198. Namespace: aws.String(namespace),
  199. }
  200. var resp cloudwatch.ListMetricsOutput
  201. err := svc.ListMetricsPages(params,
  202. func(page *cloudwatch.ListMetricsOutput, lastPage bool) bool {
  203. metrics, _ := awsutil.ValuesAtPath(page, "Metrics")
  204. for _, metric := range metrics {
  205. resp.Metrics = append(resp.Metrics, metric.(*cloudwatch.Metric))
  206. }
  207. return !lastPage
  208. })
  209. if err != nil {
  210. return resp, err
  211. }
  212. return resp, nil
  213. }
  214. var metricsCacheLock sync.Mutex
  215. func getMetricsForCustomMetrics(region string, namespace string, database string, getAllMetrics func(string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  216. result, err := getAllMetrics(region, namespace, database)
  217. if err != nil {
  218. return []string{}, err
  219. }
  220. metricsCacheLock.Lock()
  221. defer metricsCacheLock.Unlock()
  222. if _, ok := customMetricsMetricsMap[database]; !ok {
  223. customMetricsMetricsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  224. }
  225. if _, ok := customMetricsMetricsMap[database][region]; !ok {
  226. customMetricsMetricsMap[database][region] = make(map[string]*CustomMetricsCache)
  227. }
  228. if _, ok := customMetricsMetricsMap[database][region][namespace]; !ok {
  229. customMetricsMetricsMap[database][region][namespace] = &CustomMetricsCache{}
  230. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  231. }
  232. if customMetricsMetricsMap[database][region][namespace].Expire.After(time.Now()) {
  233. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  234. }
  235. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  236. customMetricsMetricsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  237. for _, metric := range result.Metrics {
  238. if isDuplicate(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName) {
  239. continue
  240. }
  241. customMetricsMetricsMap[database][region][namespace].Cache = append(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName)
  242. }
  243. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  244. }
  245. var dimensionsCacheLock sync.Mutex
  246. func getDimensionsForCustomMetrics(region string, namespace string, database string, getAllMetrics func(string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  247. result, err := getAllMetrics(region, namespace, database)
  248. if err != nil {
  249. return []string{}, err
  250. }
  251. dimensionsCacheLock.Lock()
  252. defer dimensionsCacheLock.Unlock()
  253. if _, ok := customMetricsDimensionsMap[database]; !ok {
  254. customMetricsDimensionsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  255. }
  256. if _, ok := customMetricsDimensionsMap[database][region]; !ok {
  257. customMetricsDimensionsMap[database][region] = make(map[string]*CustomMetricsCache)
  258. }
  259. if _, ok := customMetricsDimensionsMap[database][region][namespace]; !ok {
  260. customMetricsDimensionsMap[database][region][namespace] = &CustomMetricsCache{}
  261. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  262. }
  263. if customMetricsDimensionsMap[database][region][namespace].Expire.After(time.Now()) {
  264. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  265. }
  266. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  267. customMetricsDimensionsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  268. for _, metric := range result.Metrics {
  269. for _, dimension := range metric.Dimensions {
  270. if isDuplicate(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name) {
  271. continue
  272. }
  273. customMetricsDimensionsMap[database][region][namespace].Cache = append(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name)
  274. }
  275. }
  276. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  277. }
  278. func isDuplicate(nameList []string, target string) bool {
  279. for _, name := range nameList {
  280. if name == target {
  281. return true
  282. }
  283. }
  284. return false
  285. }
  286. func isCustomMetrics(namespace string) bool {
  287. return strings.Index(namespace, "AWS/") != 0
  288. }