metrics.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. package cloudwatch
  2. import (
  3. "encoding/json"
  4. "sort"
  5. "strings"
  6. "sync"
  7. "time"
  8. "github.com/aws/aws-sdk-go/aws"
  9. "github.com/aws/aws-sdk-go/aws/awsutil"
  10. "github.com/aws/aws-sdk-go/aws/session"
  11. "github.com/aws/aws-sdk-go/service/cloudwatch"
  12. "github.com/grafana/grafana/pkg/middleware"
  13. "github.com/grafana/grafana/pkg/util"
  14. )
  15. var metricsMap map[string][]string
  16. var dimensionsMap map[string][]string
  17. type CustomMetricsCache struct {
  18. Expire time.Time
  19. Cache []string
  20. }
  21. var customMetricsMetricsMap map[string]map[string]map[string]*CustomMetricsCache
  22. var customMetricsDimensionsMap map[string]map[string]map[string]*CustomMetricsCache
  23. func init() {
  24. metricsMap = map[string][]string{
  25. "AWS/AutoScaling": {"GroupMinSize", "GroupMaxSize", "GroupDesiredCapacity", "GroupInServiceInstances", "GroupPendingInstances", "GroupStandbyInstances", "GroupTerminatingInstances", "GroupTotalInstances"},
  26. "AWS/Billing": {"EstimatedCharges"},
  27. "AWS/CloudFront": {"Requests", "BytesDownloaded", "BytesUploaded", "TotalErrorRate", "4xxErrorRate", "5xxErrorRate"},
  28. "AWS/CloudSearch": {"SuccessfulRequests", "SearchableDocuments", "IndexUtilization", "Partitions"},
  29. "AWS/DynamoDB": {"ConditionalCheckFailedRequests", "ConsumedReadCapacityUnits", "ConsumedWriteCapacityUnits", "OnlineIndexConsumedWriteCapacity", "OnlineIndexPercentageProgress", "OnlineIndexThrottleEvents", "ProvisionedReadCapacityUnits", "ProvisionedWriteCapacityUnits", "ReadThrottleEvents", "ReturnedItemCount", "SuccessfulRequestLatency", "SystemErrors", "ThrottledRequests", "UserErrors", "WriteThrottleEvents"},
  30. "AWS/ECS": {"CPUUtilization", "MemoryUtilization"},
  31. "AWS/ElastiCache": {
  32. "CPUUtilization", "FreeableMemory", "NetworkBytesIn", "NetworkBytesOut", "SwapUsage",
  33. "BytesUsedForCacheItems", "BytesReadIntoMemcached", "BytesWrittenOutFromMemcached", "CasBadval", "CasHits", "CasMisses", "CmdFlush", "CmdGet", "CmdSet", "CurrConnections", "CurrItems", "DecrHits", "DecrMisses", "DeleteHits", "DeleteMisses", "Evictions", "GetHits", "GetMisses", "IncrHits", "IncrMisses", "Reclaimed",
  34. "BytesUsedForHash", "CmdConfigGet", "CmdConfigSet", "CmdTouch", "CurrConfig", "EvictedUnfetched", "ExpiredUnfetched", "SlabsMoved", "TouchHits", "TouchMisses",
  35. "NewConnections", "NewItems", "UnusedMemory",
  36. "BytesUsedForCache", "CacheHits", "CacheMisses", "CurrConnections", "Evictions", "HyperLogLogBasedCmds", "NewConnections", "Reclaimed", "ReplicationBytes", "ReplicationLag", "SaveInProgress",
  37. "CurrItems", "GetTypeCmds", "HashBasedCmds", "KeyBasedCmds", "ListBasedCmds", "SetBasedCmds", "SetTypeCmds", "SortedSetBasedCmds", "StringBasedCmds",
  38. },
  39. "AWS/EBS": {"VolumeReadBytes", "VolumeWriteBytes", "VolumeReadOps", "VolumeWriteOps", "VolumeTotalReadTime", "VolumeTotalWriteTime", "VolumeIdleTime", "VolumeQueueLength", "VolumeThroughputPercentage", "VolumeConsumedReadWriteOps"},
  40. "AWS/EC2": {"CPUCreditUsage", "CPUCreditBalance", "CPUUtilization", "DiskReadOps", "DiskWriteOps", "DiskReadBytes", "DiskWriteBytes", "NetworkIn", "NetworkOut", "StatusCheckFailed", "StatusCheckFailed_Instance", "StatusCheckFailed_System"},
  41. "AWS/ELB": {"HealthyHostCount", "UnHealthyHostCount", "RequestCount", "Latency", "HTTPCode_ELB_4XX", "HTTPCode_ELB_5XX", "HTTPCode_Backend_2XX", "HTTPCode_Backend_3XX", "HTTPCode_Backend_4XX", "HTTPCode_Backend_5XX", "BackendConnectionErrors", "SurgeQueueLength", "SpilloverCount"},
  42. "AWS/ElasticBeanstalk": {"EnvironmentHealth"},
  43. "AWS/ElasticMapReduce": {"IsIdle", "JobsRunning", "JobsFailed",
  44. "MapTasksRunning", "MapTasksRemaining", "MapSlotsOpen", "RemainingMapTasksPerSlot", "ReduceTasksRunning", "ReduceTasksRemaining", "ReduceSlotsOpen",
  45. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "TaskNodesRunning", "TaskNodesPending", "LiveTaskTrackers",
  46. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "TotalLoad",
  47. "BackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup",
  48. "IsIdle", "ContainerAllocated", "ContainerReserved", "ContainerPending", "AppsCompleted", "AppsFailed", "AppsKilled", "AppsPending", "AppsRunning", "AppsSubmitted",
  49. "CoreNodesRunning", "CoreNodesPending", "LiveDataNodes", "MRTotalNodes", "MRActiveNodes", "MRLostNodes", "MRUnhealthyNodes", "MRDecommissionedNodes", "MRRebootedNodes",
  50. "S3BytesWritten", "S3BytesRead", "HDFSUtilization", "HDFSBytesRead", "HDFSBytesWritten", "MissingBlocks", "CorruptBlocks", "TotalLoad", "MemoryTotalMB", "MemoryReservedMB", "MemoryAvailableMB", "MemoryAllocatedMB", "PendingDeletionBlocks", "UnderReplicatedBlocks", "DfsPendingReplicationBlocks", "CapacityRemainingGB",
  51. "HbaseBackupFailed", "MostRecentBackupDuration", "TimeSinceLastSuccessfulBackup"},
  52. "AWS/ES": {"ClusterStatus.green", "ClusterStatus.yellow", "ClusterStatus.red", "Nodes", "SearchableDocuments", "DeletedDocuments", "CPUUtilization", "FreeStorageSpace", "JVMMemoryPressure", "AutomatedSnapshotFailure", "MasterCPUUtilization", "MasterFreeStorageSpace", "MasterJVMMemoryPressure", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "DiskQueueLength", "ReadIOPS", "WriteIOPS"},
  53. "AWS/Events": {"Invocations", "FailedInvocations", "TriggeredRules", "MatchedEvents", "ThrottledRules"},
  54. "AWS/Kinesis": {"GetRecords.Bytes", "GetRecords.IteratorAge", "GetRecords.IteratorAgeMilliseconds", "GetRecords.Latency", "GetRecords.Records", "GetRecords.Success", "IncomingBytes", "IncomingRecords", "PutRecord.Bytes", "PutRecord.Latency", "PutRecord.Success", "PutRecords.Bytes", "PutRecords.Latency", "PutRecords.Records", "PutRecords.Success", "ReadProvisionedThroughputExceeded", "WriteProvisionedThroughputExceeded", "IteratorAgeMilliseconds", "OutgoingBytes", "OutgoingRecords"},
  55. "AWS/Lambda": {"Invocations", "Errors", "Duration", "Throttles"},
  56. "AWS/Logs": {"IncomingBytes", "IncomingLogEvents", "ForwardedBytes", "ForwardedLogEvents", "DeliveryErrors", "DeliveryThrottling"},
  57. "AWS/ML": {"PredictCount", "PredictFailureCount"},
  58. "AWS/OpsWorks": {"cpu_idle", "cpu_nice", "cpu_system", "cpu_user", "cpu_waitio", "load_1", "load_5", "load_15", "memory_buffers", "memory_cached", "memory_free", "memory_swap", "memory_total", "memory_used", "procs"},
  59. "AWS/Redshift": {"CPUUtilization", "DatabaseConnections", "HealthStatus", "MaintenanceMode", "NetworkReceiveThroughput", "NetworkTransmitThroughput", "PercentageDiskSpaceUsed", "ReadIOPS", "ReadLatency", "ReadThroughput", "WriteIOPS", "WriteLatency", "WriteThroughput"},
  60. "AWS/RDS": {"BinLogDiskUsage", "CPUUtilization", "CPUCreditUsage", "CPUCreditBalance", "DatabaseConnections", "DiskQueueDepth", "FreeableMemory", "FreeStorageSpace", "ReplicaLag", "SwapUsage", "ReadIOPS", "WriteIOPS", "ReadLatency", "WriteLatency", "ReadThroughput", "WriteThroughput", "NetworkReceiveThroughput", "NetworkTransmitThroughput"},
  61. "AWS/Route53": {"HealthCheckStatus", "HealthCheckPercentageHealthy", "ConnectionTime", "SSLHandshakeTime", "TimeToFirstByte"},
  62. "AWS/SNS": {"NumberOfMessagesPublished", "PublishSize", "NumberOfNotificationsDelivered", "NumberOfNotificationsFailed"},
  63. "AWS/SQS": {"NumberOfMessagesSent", "SentMessageSize", "NumberOfMessagesReceived", "NumberOfEmptyReceives", "NumberOfMessagesDeleted", "ApproximateNumberOfMessagesDelayed", "ApproximateNumberOfMessagesVisible", "ApproximateNumberOfMessagesNotVisible"},
  64. "AWS/S3": {"BucketSizeBytes", "NumberOfObjects"},
  65. "AWS/SWF": {"DecisionTaskScheduleToStartTime", "DecisionTaskStartToCloseTime", "DecisionTasksCompleted", "StartedDecisionTasksTimedOutOnClose", "WorkflowStartToCloseTime", "WorkflowsCanceled", "WorkflowsCompleted", "WorkflowsContinuedAsNew", "WorkflowsFailed", "WorkflowsTerminated", "WorkflowsTimedOut",
  66. "ActivityTaskScheduleToCloseTime", "ActivityTaskScheduleToStartTime", "ActivityTaskStartToCloseTime", "ActivityTasksCanceled", "ActivityTasksCompleted", "ActivityTasksFailed", "ScheduledActivityTasksTimedOutOnClose", "ScheduledActivityTasksTimedOutOnStart", "StartedActivityTasksTimedOutOnClose", "StartedActivityTasksTimedOutOnHeartbeat"},
  67. "AWS/StorageGateway": {"CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "CloudBytesDownloaded", "CloudDownloadLatency", "CloudBytesUploaded", "UploadBufferFree", "UploadBufferPercentUsed", "UploadBufferUsed", "QueuedWrites", "ReadBytes", "ReadTime", "TotalCacheSize", "WriteBytes", "WriteTime", "TimeSinceLastRecoveryPoint", "WorkingStorageFree", "WorkingStoragePercentUsed", "WorkingStorageUsed",
  68. "CacheHitPercent", "CachePercentUsed", "CachePercentDirty", "ReadBytes", "ReadTime", "WriteBytes", "WriteTime", "QueuedWrites"},
  69. "AWS/WAF": {"AllowedRequests", "BlockedRequests", "CountedRequests"},
  70. "AWS/WorkSpaces": {"Available", "Unhealthy", "ConnectionAttempt", "ConnectionSuccess", "ConnectionFailure", "SessionLaunchTime", "InSessionLatency", "SessionDisconnect"},
  71. }
  72. dimensionsMap = map[string][]string{
  73. "AWS/AutoScaling": {"AutoScalingGroupName"},
  74. "AWS/Billing": {"ServiceName", "LinkedAccount", "Currency"},
  75. "AWS/CloudFront": {"DistributionId", "Region"},
  76. "AWS/CloudSearch": {},
  77. "AWS/DynamoDB": {"TableName", "GlobalSecondaryIndexName", "Operation"},
  78. "AWS/ECS": {"ClusterName", "ServiceName"},
  79. "AWS/ElastiCache": {"CacheClusterId", "CacheNodeId"},
  80. "AWS/EBS": {"VolumeId"},
  81. "AWS/EC2": {"AutoScalingGroupName", "ImageId", "InstanceId", "InstanceType"},
  82. "AWS/ELB": {"LoadBalancerName", "AvailabilityZone"},
  83. "AWS/ElasticBeanstalk": {"EnvironmentName"},
  84. "AWS/ElasticMapReduce": {"ClusterId", "JobFlowId", "JobId"},
  85. "AWS/ES": {"ClientId", "DomainName"},
  86. "AWS/Events": {"RuleName"},
  87. "AWS/Kinesis": {"StreamName", "ShardID"},
  88. "AWS/Lambda": {"FunctionName"},
  89. "AWS/Logs": {"LogGroupName", "DestinationType", "FilterName"},
  90. "AWS/ML": {"MLModelId", "RequestMode"},
  91. "AWS/OpsWorks": {"StackId", "LayerId", "InstanceId"},
  92. "AWS/Redshift": {"NodeID", "ClusterIdentifier"},
  93. "AWS/RDS": {"DBInstanceIdentifier", "DatabaseClass", "EngineName"},
  94. "AWS/Route53": {"HealthCheckId"},
  95. "AWS/SNS": {"Application", "Platform", "TopicName"},
  96. "AWS/SQS": {"QueueName"},
  97. "AWS/S3": {"BucketName", "StorageType"},
  98. "AWS/SWF": {"Domain", "WorkflowTypeName", "WorkflowTypeVersion", "ActivityTypeName", "ActivityTypeVersion"},
  99. "AWS/StorageGateway": {"GatewayId", "GatewayName", "VolumeId"},
  100. "AWS/WAF": {"Rule", "WebACL"},
  101. "AWS/WorkSpaces": {"DirectoryId", "WorkspaceId"},
  102. }
  103. customMetricsMetricsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  104. customMetricsDimensionsMap = make(map[string]map[string]map[string]*CustomMetricsCache)
  105. }
  106. // Whenever this list is updated, frontend list should also be updated.
  107. // Please update the region list in public/app/plugins/datasource/cloudwatch/partials/config.html
  108. func handleGetRegions(req *cwRequest, c *middleware.Context) {
  109. regions := []string{
  110. "ap-northeast-1", "ap-northeast-2", "ap-southeast-1", "ap-southeast-2", "cn-north-1",
  111. "eu-central-1", "eu-west-1", "sa-east-1", "us-east-1", "us-west-1", "us-west-2",
  112. }
  113. result := []interface{}{}
  114. for _, region := range regions {
  115. result = append(result, util.DynMap{"text": region, "value": region})
  116. }
  117. c.JSON(200, result)
  118. }
  119. func handleGetNamespaces(req *cwRequest, c *middleware.Context) {
  120. keys := []string{}
  121. for key := range metricsMap {
  122. keys = append(keys, key)
  123. }
  124. customNamespaces := req.DataSource.JsonData.Get("customMetricsNamespaces").MustString()
  125. if customNamespaces != "" {
  126. for _, key := range strings.Split(customNamespaces, ",") {
  127. keys = append(keys, key)
  128. }
  129. }
  130. sort.Sort(sort.StringSlice(keys))
  131. result := []interface{}{}
  132. for _, key := range keys {
  133. result = append(result, util.DynMap{"text": key, "value": key})
  134. }
  135. c.JSON(200, result)
  136. }
  137. func handleGetMetrics(req *cwRequest, c *middleware.Context) {
  138. reqParam := &struct {
  139. Parameters struct {
  140. Namespace string `json:"namespace"`
  141. } `json:"parameters"`
  142. }{}
  143. json.Unmarshal(req.Body, reqParam)
  144. var namespaceMetrics []string
  145. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  146. var exists bool
  147. if namespaceMetrics, exists = metricsMap[reqParam.Parameters.Namespace]; !exists {
  148. c.JsonApiErr(404, "Unable to find namespace "+reqParam.Parameters.Namespace, nil)
  149. return
  150. }
  151. } else {
  152. var err error
  153. assumeRoleArn := req.DataSource.JsonData.Get("assumeRoleArn").MustString()
  154. if namespaceMetrics, err = getMetricsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, assumeRoleArn, getAllMetrics); err != nil {
  155. c.JsonApiErr(500, "Unable to call AWS API", err)
  156. return
  157. }
  158. }
  159. sort.Sort(sort.StringSlice(namespaceMetrics))
  160. result := []interface{}{}
  161. for _, name := range namespaceMetrics {
  162. result = append(result, util.DynMap{"text": name, "value": name})
  163. }
  164. c.JSON(200, result)
  165. }
  166. func handleGetDimensions(req *cwRequest, c *middleware.Context) {
  167. reqParam := &struct {
  168. Parameters struct {
  169. Namespace string `json:"namespace"`
  170. } `json:"parameters"`
  171. }{}
  172. json.Unmarshal(req.Body, reqParam)
  173. var dimensionValues []string
  174. if !isCustomMetrics(reqParam.Parameters.Namespace) {
  175. var exists bool
  176. if dimensionValues, exists = dimensionsMap[reqParam.Parameters.Namespace]; !exists {
  177. c.JsonApiErr(404, "Unable to find dimension "+reqParam.Parameters.Namespace, nil)
  178. return
  179. }
  180. } else {
  181. var err error
  182. assumeRoleArn := req.DataSource.JsonData.Get("assumeRoleArn").MustString()
  183. if dimensionValues, err = getDimensionsForCustomMetrics(req.Region, reqParam.Parameters.Namespace, req.DataSource.Database, assumeRoleArn, getAllMetrics); err != nil {
  184. c.JsonApiErr(500, "Unable to call AWS API", err)
  185. return
  186. }
  187. }
  188. sort.Sort(sort.StringSlice(dimensionValues))
  189. result := []interface{}{}
  190. for _, name := range dimensionValues {
  191. result = append(result, util.DynMap{"text": name, "value": name})
  192. }
  193. c.JSON(200, result)
  194. }
  195. func getAllMetrics(region string, namespace string, database string, assumeRoleArn string) (cloudwatch.ListMetricsOutput, error) {
  196. cfg := &aws.Config{
  197. Region: aws.String(region),
  198. Credentials: getCredentials(database, region, assumeRoleArn),
  199. }
  200. svc := cloudwatch.New(session.New(cfg), cfg)
  201. params := &cloudwatch.ListMetricsInput{
  202. Namespace: aws.String(namespace),
  203. }
  204. var resp cloudwatch.ListMetricsOutput
  205. err := svc.ListMetricsPages(params,
  206. func(page *cloudwatch.ListMetricsOutput, lastPage bool) bool {
  207. metrics, _ := awsutil.ValuesAtPath(page, "Metrics")
  208. for _, metric := range metrics {
  209. resp.Metrics = append(resp.Metrics, metric.(*cloudwatch.Metric))
  210. }
  211. return !lastPage
  212. })
  213. if err != nil {
  214. return resp, err
  215. }
  216. return resp, nil
  217. }
  218. var metricsCacheLock sync.Mutex
  219. func getMetricsForCustomMetrics(region string, namespace string, database string, assumeRoleArn string, getAllMetrics func(string, string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  220. result, err := getAllMetrics(region, namespace, database, assumeRoleArn)
  221. if err != nil {
  222. return []string{}, err
  223. }
  224. metricsCacheLock.Lock()
  225. defer metricsCacheLock.Unlock()
  226. if _, ok := customMetricsMetricsMap[database]; !ok {
  227. customMetricsMetricsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  228. }
  229. if _, ok := customMetricsMetricsMap[database][region]; !ok {
  230. customMetricsMetricsMap[database][region] = make(map[string]*CustomMetricsCache)
  231. }
  232. if _, ok := customMetricsMetricsMap[database][region][namespace]; !ok {
  233. customMetricsMetricsMap[database][region][namespace] = &CustomMetricsCache{}
  234. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  235. }
  236. if customMetricsMetricsMap[database][region][namespace].Expire.After(time.Now()) {
  237. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  238. }
  239. customMetricsMetricsMap[database][region][namespace].Cache = make([]string, 0)
  240. customMetricsMetricsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  241. for _, metric := range result.Metrics {
  242. if isDuplicate(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName) {
  243. continue
  244. }
  245. customMetricsMetricsMap[database][region][namespace].Cache = append(customMetricsMetricsMap[database][region][namespace].Cache, *metric.MetricName)
  246. }
  247. return customMetricsMetricsMap[database][region][namespace].Cache, nil
  248. }
  249. var dimensionsCacheLock sync.Mutex
  250. func getDimensionsForCustomMetrics(region string, namespace string, database string, assumeRoleArn string, getAllMetrics func(string, string, string, string) (cloudwatch.ListMetricsOutput, error)) ([]string, error) {
  251. result, err := getAllMetrics(region, namespace, database, assumeRoleArn)
  252. if err != nil {
  253. return []string{}, err
  254. }
  255. dimensionsCacheLock.Lock()
  256. defer dimensionsCacheLock.Unlock()
  257. if _, ok := customMetricsDimensionsMap[database]; !ok {
  258. customMetricsDimensionsMap[database] = make(map[string]map[string]*CustomMetricsCache)
  259. }
  260. if _, ok := customMetricsDimensionsMap[database][region]; !ok {
  261. customMetricsDimensionsMap[database][region] = make(map[string]*CustomMetricsCache)
  262. }
  263. if _, ok := customMetricsDimensionsMap[database][region][namespace]; !ok {
  264. customMetricsDimensionsMap[database][region][namespace] = &CustomMetricsCache{}
  265. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  266. }
  267. if customMetricsDimensionsMap[database][region][namespace].Expire.After(time.Now()) {
  268. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  269. }
  270. customMetricsDimensionsMap[database][region][namespace].Cache = make([]string, 0)
  271. customMetricsDimensionsMap[database][region][namespace].Expire = time.Now().Add(5 * time.Minute)
  272. for _, metric := range result.Metrics {
  273. for _, dimension := range metric.Dimensions {
  274. if isDuplicate(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name) {
  275. continue
  276. }
  277. customMetricsDimensionsMap[database][region][namespace].Cache = append(customMetricsDimensionsMap[database][region][namespace].Cache, *dimension.Name)
  278. }
  279. }
  280. return customMetricsDimensionsMap[database][region][namespace].Cache, nil
  281. }
  282. func isDuplicate(nameList []string, target string) bool {
  283. for _, name := range nameList {
  284. if name == target {
  285. return true
  286. }
  287. }
  288. return false
  289. }
  290. func isCustomMetrics(namespace string) bool {
  291. return strings.Index(namespace, "AWS/") != 0
  292. }