metrics.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. package cloudwatch
  2. import (
  3. "encoding/json"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/aws/aws-sdk-go/aws"
  9. "github.com/aws/aws-sdk-go/aws/awsutil"
  10. "github.com/aws/aws-sdk-go/aws/session"
  11. "github.com/aws/aws-sdk-go/service/cloudwatch"
  12. "github.com/grafana/grafana/pkg/middleware"
  13. "github.com/grafana/grafana/pkg/util"
  14. )
  15. var metricsMap map[string][]string
  16. var dimensionsMap map[string][]string
  17. type CustomMetricsCache struct {
  18. Expire time.Time
  19. Cache []string
  20. }
  21. var customMetricsMetricsMap map[string]map[string]map[string]*CustomMetricsCache
  22. var customMetricsDimensionsMap map[string]map[string]map[string]*CustomMetricsCache
  23. func init() {
  24. metricsMap = map[string][]string{
  25. "AWS/AutoScaling": {"GroupMinSize", "GroupMaxSize", "GroupDesiredCapacity", "GroupInServiceInstances", "GroupPendingInstances", "GroupStandbyInstances", "GroupTerminatingInstances", "GroupTotalInstances"},
  26. "AWS/Billing": {"EstimatedCharges"},
  27. "AWS/CloudFront": {"Requests", "BytesDownloaded", "BytesUploaded", "TotalErrorRate", "4xxErrorRate", "5xxErrorRate"},
  28. "AWS/CloudSearch": {"SuccessfulRequests", "SearchableDocuments", "IndexUtilization", "Partitions"},
  29. "AWS/DynamoDB": {"ConditionalCheckFailedRequests", "ConsumedReadCapacityUnits", "ConsumedWriteCapacityUnits", "OnlineIndexConsumedWriteCapacity", "OnlineIndexPercentageProgress", "OnlineIndexThrottleEvents", "ProvisionedReadCapacityUnits", "ProvisionedWriteCapacityUnits", "ReadThrottleEvents", "ReturnedItemCount", "SuccessfulRequestLatency", "SystemErrors", "ThrottledRequests", "UserErrors", "WriteThrottleEvents"},
  30. "AWS/ECS": {"CPUUtilization", "MemoryUtilization"},
  31. "AWS/ElastiCache": {
  32. "CPUUtilization", "FreeableMemory", "NetworkBytesIn", "NetworkBytesOut", "SwapUsage",
  33. "BytesUsedForCacheItems", "BytesReadIntoMemcached", "BytesWrittenOutFromMemcached", "CasBadval", "CasHits", "CasMisses", "CmdFlush", "CmdGet", "CmdSet", "CurrConnections", "CurrItems", "DecrHits", "DecrMisses", "DeleteHits", "DeleteMisses", "Evictions", "GetHits", "GetMisses", "IncrHits", "IncrMisses", "Reclaimed",
  34. "BytesUsedForHash", "CmdConfigGet", "CmdConfigSet", "CmdTouch", "CurrConfig", "EvictedUnfetched", "ExpiredUnfetched", "SlabsMoved", "TouchHits", "TouchMisses",
  35. "NewConnections", "NewItems", "UnusedMemory",
  36. "BytesUsedForCache", "CacheHits", "CacheMisses", "CurrConnections", "Evictions", "HyperLogLogBasedCmds", "NewConnections", "Reclaimed", "ReplicationBytes", "ReplicationLag", "SaveInProgress",
  37. "CurrItems", "GetTypeCmds", "HashBasedCmds", "KeyBasedCmds", "ListBasedCmds", "SetBasedCmds", "SetTypeCmds", "SortedSetBasedCmds", "StringBasedCmds",
  38. },
  39. "AWS/EBS": {"VolumeReadBytes", "VolumeWriteBytes", "VolumeReadOps", "VolumeWriteOps", "VolumeTotalReadTime", "VolumeTotalWriteTime", "VolumeIdleTime", "VolumeQueueLength", "VolumeThroughputPercentage", "VolumeConsumedReadWriteOps"},
  40. "AWS/EC2": {"CPUCreditUsage", "CPUCreditBalance", "CPUUtilization", "DiskReadOps", "DiskWriteOps", "DiskReadBytes", "DiskWriteBytes", "NetworkIn", "NetworkOut", "StatusCheckFailed", "StatusCheckFailed_Instance", "StatusCheckFailed_System"},
  41. "AWS/ELB": {"HealthyHostCount", "UnHealthyHostCount", "RequestCount", "Latency", "HTTPCode_ELB_4XX", "HTTPCode_ELB_5XX", "HTTPCode_Backend_2XX", "HTTPCode_Backend_3XX", "HTTPCode_Backend_4XX", "HTTPCode_Backend_5XX", "BackendConnectionErrors", "SurgeQueueLength", "SpilloverCount"},
  42. "AWS/ElasticMapReduce": {"IsIdle", "JobsRunning", "JobsFailed",
  43. "MapTasksRunning", "MapTasksRemaining", "MapSlotsOpen", "RemainingMapTasksPerSlot", "ReduceTasksRunning", "ReduceTasksRemaining", "ReduceSlotsOpen",
  44. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "TaskNodesRunning", "TaskNodesPending", "LiveTaskTrackers",
  45. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "TotalLoad",
  46. "BackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup",
  47. "IsIdle", "ContainerAllocated", "ContainerReserved", "ContainerPending", "AppsCompleted", "AppsFailed", "AppsKilled", "AppsPending", "AppsRunning", "AppsSubmitted",
  48. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "MRTotalNodes", "MRActiveNodes", "MRLostNodes", "MRUnhealthyNodes", "MRDecommissionedNodes", "MRRebootedNodes",
  49. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "CorruptBlocks", "TotalLoad", "MemoryTotalMB", "MemoryReservedMB", "MemoryAvailableMB", "MemoryAllocatedMB", "PendingDeletionBlocks", "UnderReplicatedBlocks", "DfsPendingReplicationBlocks", "CapacityRemainingGB",
  50. "HbaseBackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup"},
  51. "AWS/ES": {"ClusterStatus.green", "ClusterStatus.yellow", "ClusterStatus.red", "Nodes", "SearchableDocuments", "DeletedDocuments", "CPUUtilization", "FreeStorageSpace", "JVMMemoryPressure", "AutomatedSnapshotFailure", "MasterCPUUtilization", "MasterFreeStorageSpace", "MasterJVMMemoryPressure", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "DiskQueueLength", "ReadIOPS", "WriteIOPS"},
  52. "AWS/Events": {"Invocations", "FailedInvocations", "TriggeredRules", "MatchedEvents", "ThrottledRules"},
  53. "AWS/Kinesis": {"GetRecords.Bytes", "GetRecords.IteratorAge", "GetRecords.IteratorAgeMilliseconds", "GetRecords.Latency", "GetRecords.Records", "GetRecords.Success", "IncomingBytes", "IncomingRecords", "PutRecord.Bytes", "PutRecord.Latency", "PutRecord.Success", "PutRecords.Bytes", "PutRecords.Latency", "PutRecords.Records", "PutRecords.Success", "ReadProvisionedThroughputExceeded", "WriteProvisionedThroughputExceeded", "IteratorAgeMilliseconds", "OutgoingBytes", "OutgoingRecords"},
  54. "AWS/Lambda": {"Invocations", "Errors", "Duration", "Throttles"},
  55. "AWS/Logs": {"IncomingBytes", "IncomingLogEvents", "ForwardedBytes", "ForwardedLogEvents", "DeliveryErrors", "DeliveryThrottling"},
  56. "AWS/ML": {"PredictCount", "PredictFailureCount"},
  57. "AWS/OpsWorks": {"cpu_idle", "cpu_nice", "cpu_system", "cpu_user", "cpu_waitio", "load_1", "load_5", "load_15", "memory_buffers", "memory_cached", "memory_free", "memory_swap", "memory_total", "memory_used", "procs"},
  58. "AWS/Redshift": {"CPUUtilization", "DatabaseConnections", "HealthStatus", "MaintenanceMode", "NetworkReceiveThroughput", "NetworkTransmitThroughput", "PercentageDiskSpaceUsed", "ReadIOPS", "ReadLatency", "ReadThroughput", "WriteIOPS", "WriteLatency", "WriteThroughput"},
  59. "AWS/RDS": {"BinLogDiskUsage", "CPUUtilization", "CPUCreditUsage", "CPUCreditBalance", "DatabaseConnections", "DiskQueueDepth", "FreeableMemory", "FreeStorageSpace", "ReplicaLag", "SwapUsage", "ReadIOPS", "WriteIOPS", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "NetworkReceiveThroughput", "NetworkTransmitThroughput"},
  60. "AWS/Route53": {"HealthCheckStatus", "HealthCheckPercentageHealthy", "ConnectionTime", "SSLHandshakeTime", "TimeToFirstByte"},
  61. "AWS/SNS": {"NumberOfMessagesPublished", "PublishSize", "NumberOfNotificationsDelivered", "NumberOfNotificationsFailed"},
  62. "AWS/SQS": {"NumberOfMessagesSent", "SentMessageSize", "NumberOfMessagesReceived", "NumberOfEmptyReceives", "NumberOfMessagesDeleted", "ApproximateNumberOfMessagesDelayed", "ApproximateNumberOfMessagesVisible", "ApproximateNumberOfMessagesNotVisible"},
  63. "AWS/S3": {"BucketSizeBytes", "NumberOfObjects"},
  64. "AWS/SWF": {"DecisionTaskScheduleToStartTime", "DecisionTaskStartToCloseTime", "DecisionTasksCompleted", "StartedDecisionTasksTimedOutOnClose", "WorkflowStartToCloseTime", "WorkflowsCanceled", "WorkflowsCompleted", "WorkflowsContinuedAsNew", "WorkflowsFailed", "WorkflowsTerminated", "WorkflowsTimedOut",
  65. "ActivityTaskScheduleToCloseTime", "ActivityTaskScheduleToStartTime", "ActivityTaskStartToCloseTime", "ActivityTasksCanceled", "ActivityTasksCompleted", "ActivityTasksFailed", "ScheduledActivityTasksTimedOutOnClose", "ScheduledActivityTasksTimedOutOnStart", "StartedActivityTasksTimedOutOnClose", "StartedActivityTasksTimedOutOnHeartbeat"},
  66. "AWS/StorageGateway": {"CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "CloudBytesDownloaded", "CloudDownloadLatency", "CloudBytesUploaded", "UploadBufferFree", "UploadBufferPercentUsed", "UploadBufferUsed", "QueuedWrites", "ReadBytes", "ReadTime", "TotalCacheSize", "WriteBytes", "WriteTime", "TimeSinceLastRecoveryPoint", "WorkingStorageFree", "WorkingStoragePercentUsed", "WorkingStorageUsed",
  67. "CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "ReadBytes", "ReadTime", "WriteBytes", "WriteTime", "QueuedWrites"},
  68. "AWS/WAF": {"AllowedRequests", "BlockedRequests", "CountedRequests"},
  69. "AWS/WorkSpaces": {"Available", "Unhealthy", "ConnectionAttempt", "ConnectionSuccess", "ConnectionFailure", "SessionLaunchTime", "InSessionLatency", "SessionDisconnect"},
  70. }
  71. dimensionsMap = map[string][]string{
  72. "AWS/AutoScaling": {"AutoScalingGroupName"},
  73. "AWS/Billing": {"ServiceName", "LinkedAccount", "Currency"},
  74. "AWS/CloudFront": {"DistributionId", "Region"},
  75. "AWS/CloudSearch": {},
  76. "AWS/DynamoDB": {"TableName", "GlobalSecondaryIndexName", "Operation"},
  77. "AWS/ECS": {"ClusterName", "ServiceName"},
  78. "AWS/ElastiCache": {"CacheClusterId", "CacheNodeId"},
  79. "AWS/EBS": {"VolumeId"},
  80. "AWS/EC2": {"AutoScalingGroupName", "ImageId", "InstanceId", "InstanceType"},
  81. "AWS/ELB": {"LoadBalancerName", "AvailabilityZone"},
  82. "AWS/ElasticMapReduce": {"ClusterId", "JobFlowId", "JobId"},
  83. "AWS/ES": {"ClientId", "DomainName"},
  84. "AWS/Events": {"RuleName"},
  85. "AWS/Kinesis": {"StreamName", "ShardID"},
  86. "AWS/Lambda": {"FunctionName"},
  87. "AWS/Logs": {"LogGroupName", "DestinationType", "FilterName"},
  88. "AWS/ML": {"MLModelId", "RequestMode"},
  89. "AWS/OpsWorks": {"StackId", "LayerId", "InstanceId"},
  90. "AWS/Redshift": {"NodeID", "ClusterIdentifier"},
  91. "AWS/RDS": {"DBInstanceIdentifier", "DatabaseClass", "EngineName"},
  92. "AWS/Route53": {"HealthCheckId"},
  93. "AWS/SNS": {"Application", "Platform", "TopicName"},
  94. "AWS/SQS": {"QueueName"},
  95. "AWS/S3": {"BucketName", "StorageType"},
  96. "AWS/SWF": {"Domain", "WorkflowTypeName", "WorkflowTypeVersion", "ActivityTypeName", "ActivityTypeVersion"},
  97. "AWS/StorageGateway": {"GatewayId", "GatewayName", "VolumeId"},
  98. "AWS/WAF": {"Rule", "WebACL"},
  99. "AWS/WorkSpaces": {"DirectoryId", "WorkspaceId"},
  100. }
  101. customMetricsMetricsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  102. customMetricsDimensionsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  103. }
  104. // Whenever this list is updated, frontend list should also be updated.
  105. // Please update the region list in public/app/plugins/datasource/cloudwatch/partials/config.html
  106. func handleGetRegions(req *cwRequest, c *middleware.Context) {
  107. regions := []string{
  108. "ap-northeast-1", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "cn-north-1",
  109. "eu-central-1", "eu-west-1", "sa-east-1", "us-east-1", "us-west-1", "us-west-2",
  110. }
  111. result := []interface{}{}
  112. for _, region := range regions {
  113. result = append(result, util.DynMap{"text": region, "value": region})
  114. }
  115. c.JSON(200, result)
  116. }
  117. func handleGetNamespaces(req *cwRequest, c *middleware.Context) {
  118. keys := []string{}
  119. for key := range metricsMap {
  120. keys = append(keys, key)
  121. }
  122. customNamespaces := req.DataSource.JsonData.Get("customMetricsNamespaces").MustString()
  123. if customNamespaces != "" {
  124. for _, key := range strings.Split(customNamespaces, ",") {
  125. keys = append(keys, key)
  126. }
  127. }
  128. sort.Sort(sort.StringSlice(keys))
  129. result := []interface{}{}
  130. for _, key := range keys {
  131. result = append(result, util.DynMap{"text": key, "value": key})
  132. }
  133. c.JSON(200, result)
  134. }
  135. func handleGetMetrics(req *cwRequest, c *middleware.Context) {
  136. reqParam := &struct {
  137. Parameters struct {
  138. Namespace string `json:"namespace"`
  139. } `json:"parameters"`
  140. }{}
  141. json.Unmarshal(req.Body, reqParam)
  142. var namespaceMetrics []string
  143. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  144. var exists bool
  145. if namespaceMetrics, exists = metricsMap[reqParam.Parameters.Namespace]; !exists {
  146. c.JsonApiErr(404, "Unable to find namespace "+reqParam.Parameters.Namespace, nil)
  147. return
  148. }
  149. } else {
  150. var err error
  151. assumeRoleArn := req.DataSource.JsonData.Get("assumeRoleArn").MustString()
  152. if namespaceMetrics, err = getMetricsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, assumeRoleArn, getAllMetrics); err != nil {
  153. c.JsonApiErr(500, "Unable to call AWS API", err)
  154. return
  155. }
  156. }
  157. sort.Sort(sort.StringSlice(namespaceMetrics))
  158. result := []interface{}{}
  159. for _, name := range namespaceMetrics {
  160. result = append(result, util.DynMap{"text": name, "value": name})
  161. }
  162. c.JSON(200, result)
  163. }
  164. func handleGetDimensions(req *cwRequest, c *middleware.Context) {
  165. reqParam := &struct {
  166. Parameters struct {
  167. Namespace string `json:"namespace"`
  168. } `json:"parameters"`
  169. }{}
  170. json.Unmarshal(req.Body, reqParam)
  171. var dimensionValues []string
  172. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  173. var exists bool
  174. if dimensionValues, exists = dimensionsMap[reqParam.Parameters.Namespace]; !exists {
  175. c.JsonApiErr(404, "Unable to find dimension "+reqParam.Parameters.Namespace, nil)
  176. return
  177. }
  178. } else {
  179. var err error
  180. assumeRoleArn := req.DataSource.JsonData.Get("assumeRoleArn").MustString()
  181. if dimensionValues, err = getDimensionsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, assumeRoleArn, getAllMetrics); err != nil {
  182. c.JsonApiErr(500, "Unable to call AWS API", err)
  183. return
  184. }
  185. }
  186. sort.Sort(sort.StringSlice(dimensionValues))
  187. result := []interface{}{}
  188. for _, name := range dimensionValues {
  189. result = append(result, util.DynMap{"text": name, "value": name})
  190. }
  191. c.JSON(200, result)
  192. }
  193. func getAllMetrics(region string, namespace string, database string, assumeRoleArn string) (cloudwatch.ListMetricsOutput, error) {
  194. cfg := &aws.Config{
  195. Region: aws.String(region),
  196. Credentials: getCredentials(database, region, assumeRoleArn),
  197. }
  198. svc := cloudwatch.New(session.New(cfg), cfg)
  199. params := &cloudwatch.ListMetricsInput{
  200. Namespace: aws.String(namespace),
  201. }
  202. var resp cloudwatch.ListMetricsOutput
  203. err := svc.ListMetricsPages(params,
  204. func(page *cloudwatch.ListMetricsOutput, lastPage bool) bool {
  205. metrics, _ := awsutil.ValuesAtPath(page, "Metrics")
  206. for _, metric := range metrics {
  207. resp.Metrics = append(resp.Metrics, metric.(*cloudwatch.Metric))
  208. }
  209. return !lastPage
  210. })
  211. if err != nil {
  212. return resp, err
  213. }
  214. return resp, nil
  215. }
  216. var metricsCacheLock sync.Mutex
  217. func getMetricsForCustomMetrics(region string, namespace string, database string, assumeRoleArn string, getAllMetrics func(string, string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  218. result, err := getAllMetrics(region, namespace, database, assumeRoleArn)
  219. if err != nil {
  220. return []string{}, err
  221. }
  222. metricsCacheLock.Lock()
  223. defer metricsCacheLock.Unlock()
  224. if _, ok := customMetricsMetricsMap[database]; !ok {
  225. customMetricsMetricsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  226. }
  227. if _, ok := customMetricsMetricsMap[database][region]; !ok {
  228. customMetricsMetricsMap[database][region] = make(map[string]*CustomMetricsCache)
  229. }
  230. if _, ok := customMetricsMetricsMap[database][region][namespace]; !ok {
  231. customMetricsMetricsMap[database][region][namespace] = &CustomMetricsCache{}
  232. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  233. }
  234. if customMetricsMetricsMap[database][region][namespace].Expire.After(time.Now()) {
  235. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  236. }
  237. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  238. customMetricsMetricsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  239. for _, metric := range result.Metrics {
  240. if isDuplicate(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName) {
  241. continue
  242. }
  243. customMetricsMetricsMap[database][region][namespace].Cache = append(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName)
  244. }
  245. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  246. }
  247. var dimensionsCacheLock sync.Mutex
  248. func getDimensionsForCustomMetrics(region string, namespace string, database string, assumeRoleArn string, getAllMetrics func(string, string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  249. result, err := getAllMetrics(region, namespace, database, assumeRoleArn)
  250. if err != nil {
  251. return []string{}, err
  252. }
  253. dimensionsCacheLock.Lock()
  254. defer dimensionsCacheLock.Unlock()
  255. if _, ok := customMetricsDimensionsMap[database]; !ok {
  256. customMetricsDimensionsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  257. }
  258. if _, ok := customMetricsDimensionsMap[database][region]; !ok {
  259. customMetricsDimensionsMap[database][region] = make(map[string]*CustomMetricsCache)
  260. }
  261. if _, ok := customMetricsDimensionsMap[database][region][namespace]; !ok {
  262. customMetricsDimensionsMap[database][region][namespace] = &CustomMetricsCache{}
  263. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  264. }
  265. if customMetricsDimensionsMap[database][region][namespace].Expire.After(time.Now()) {
  266. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  267. }
  268. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  269. customMetricsDimensionsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  270. for _, metric := range result.Metrics {
  271. for _, dimension := range metric.Dimensions {
  272. if isDuplicate(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name) {
  273. continue
  274. }
  275. customMetricsDimensionsMap[database][region][namespace].Cache = append(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name)
  276. }
  277. }
  278. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  279. }
  280. func isDuplicate(nameList []string, target string) bool {
  281. for _, name := range nameList {
  282. if name == target {
  283. return true
  284. }
  285. }
  286. return false
  287. }
  288. func isCustomMetrics(namespace string) bool {
  289. return strings.Index(namespace, "AWS/") != 0
  290. }