|
NodeManager: |
|
Node ID: ed5029aed12dfb118ce5ec8eeddd392389018cb6a21a8c84b00cbebd |
|
Node name: 192.168.0.2 |
|
InitialConfigResources: {node:192.168.0.2: 10000, CPU: 200000, accelerator_type:A40: 10000, memory: 846480855040000, node:__internal_head__: 10000, object_store_memory: 21474836480000, GPU: 20000} |
|
ClusterTaskManager: |
|
========== Node: ed5029aed12dfb118ce5ec8eeddd392389018cb6a21a8c84b00cbebd ================= |
|
Infeasible queue length: 0 |
|
Schedule queue length: 0 |
|
Dispatch queue length: 0 |
|
num_waiting_for_resource: 0 |
|
num_waiting_for_plasma_memory: 0 |
|
num_waiting_for_remote_node_resources: 0 |
|
num_worker_not_started_by_job_config_not_exist: 0 |
|
num_worker_not_started_by_registration_timeout: 0 |
|
num_tasks_waiting_for_workers: 0 |
|
num_cancelled_tasks: 0 |
|
cluster_resource_scheduler state: |
|
Local id: -609853312980384924 Local resources: {"total":{GPU: [10000, 10000], node:192.168.0.2: [10000], accelerator_type:A40: [10000], CPU: [200000], object_store_memory: [21474836480000], node:__internal_head__: [10000], memory: [846480855040000]}}, "available": {GPU: [10000, 10000], node:192.168.0.2: [10000], accelerator_type:A40: [10000], CPU: [200000], object_store_memory: [21474836480000], node:__internal_head__: [10000], memory: [846480855040000]}}, "labels":{"ray.io/node_id":"ed5029aed12dfb118ce5ec8eeddd392389018cb6a21a8c84b00cbebd",} is_draining: 0 is_idle: 1 Cluster resources: node id: -609853312980384924{"total":{accelerator_type:A40: 10000, node:192.168.0.2: 10000, GPU: 20000, memory: 846480855040000, CPU: 200000, object_store_memory: 21474836480000, node:__internal_head__: 10000}}, "available": {accelerator_type:A40: 10000, node:192.168.0.2: 10000, object_store_memory: 21474836480000, CPU: 200000, node:__internal_head__: 10000, memory: 846480855040000, GPU: 20000}}, "labels":{"ray.io/node_id":"ed5029aed12dfb118ce5ec8eeddd392389018cb6a21a8c84b00cbebd",}, "is_draining": 0, "draining_deadline_timestamp_ms": -1} { "placment group locations": [], "node to bundles": []} |
|
Waiting tasks size: 0 |
|
Number of executing tasks: 0 |
|
Number of pinned task arguments: 0 |
|
Number of total spilled tasks: 0 |
|
Number of spilled waiting tasks: 0 |
|
Number of spilled unschedulable tasks: 0 |
|
Resource usage { |
|
} |
|
Backlog Size per scheduling descriptor :{workerId: num backlogs}: |
|
|
|
Running tasks by scheduling class: |
|
================================================== |
|
|
|
ClusterResources: |
|
LocalObjectManager: |
|
- num pinned objects: 0 |
|
- pinned objects size: 0 |
|
- num objects pending restore: 0 |
|
- num objects pending spill: 0 |
|
- num bytes pending spill: 0 |
|
- num bytes currently spilled: 0 |
|
- cumulative spill requests: 0 |
|
- cumulative restore requests: 0 |
|
- spilled objects pending delete: 0 |
|
|
|
ObjectManager: |
|
- num local objects: 0 |
|
- num unfulfilled push requests: 0 |
|
- num object pull requests: 0 |
|
- num chunks received total: 0 |
|
- num chunks received failed (all): 0 |
|
- num chunks received failed / cancelled: 0 |
|
- num chunks received failed / plasma error: 0 |
|
Event stats: |
|
Global stats: 0 total (0 active) |
|
Queueing time: mean = -nan s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
Execution time: mean = -nan s, total = 0.000 s |
|
Event stats: |
|
PushManager: |
|
- num pushes in flight: 0 |
|
- num chunks in flight: 0 |
|
- num chunks remaining: 0 |
|
- max chunks allowed: 409 |
|
OwnershipBasedObjectDirectory: |
|
- num listeners: 0 |
|
- cumulative location updates: 0 |
|
- num location updates per second: 0.000 |
|
- num location lookups per second: 0.000 |
|
- num locations added per second: 0.000 |
|
- num locations removed per second: 0.000 |
|
BufferPool: |
|
- create buffer state map size: 0 |
|
PullManager: |
|
- num bytes available for pulled objects: 2147483648 |
|
- num bytes being pulled (all): 0 |
|
- num bytes being pulled / pinned: 0 |
|
- get request bundles: BundlePullRequestQueue{0 total, 0 active, 0 inactive, 0 unpullable} |
|
- wait request bundles: BundlePullRequestQueue{0 total, 0 active, 0 inactive, 0 unpullable} |
|
- task request bundles: BundlePullRequestQueue{0 total, 0 active, 0 inactive, 0 unpullable} |
|
- first get request bundle: N/A |
|
- first wait request bundle: N/A |
|
- first task request bundle: N/A |
|
- num objects queued: 0 |
|
- num objects actively pulled (all): 0 |
|
- num objects actively pulled / pinned: 0 |
|
- num bundles being pulled: 0 |
|
- num pull retries: 0 |
|
- max timeout seconds: 0 |
|
- max timeout request is already processed. No entry. |
|
|
|
WorkerPool: |
|
- registered jobs: 1 |
|
- process_failed_job_config_missing: 0 |
|
- process_failed_rate_limited: 0 |
|
- process_failed_pending_registration: 0 |
|
- process_failed_runtime_env_setup_failed: 0 |
|
- num PYTHON workers: 20 |
|
- num PYTHON drivers: 1 |
|
- num PYTHON pending start requests: 0 |
|
- num PYTHON pending registration requests: 0 |
|
- num object spill callbacks queued: 0 |
|
- num object restore queued: 0 |
|
- num util functions queued: 0 |
|
- num idle workers: 20 |
|
TaskDependencyManager: |
|
- task deps map size: 0 |
|
- get req map size: 0 |
|
- wait req map size: 0 |
|
- local objects map size: 0 |
|
WaitManager: |
|
- num active wait requests: 0 |
|
Subscriber: |
|
Channel WORKER_OBJECT_EVICTION |
|
- cumulative subscribe requests: 0 |
|
- cumulative unsubscribe requests: 0 |
|
- active subscribed publishers: 0 |
|
- cumulative published messages: 0 |
|
- cumulative processed messages: 0 |
|
Channel WORKER_REF_REMOVED_CHANNEL |
|
- cumulative subscribe requests: 0 |
|
- cumulative unsubscribe requests: 0 |
|
- active subscribed publishers: 0 |
|
- cumulative published messages: 0 |
|
- cumulative processed messages: 0 |
|
Channel WORKER_OBJECT_LOCATIONS_CHANNEL |
|
- cumulative subscribe requests: 0 |
|
- cumulative unsubscribe requests: 0 |
|
- active subscribed publishers: 0 |
|
- cumulative published messages: 0 |
|
- cumulative processed messages: 0 |
|
num async plasma notifications: 0 |
|
Remote node managers: |
|
Event stats: |
|
Global stats: 327336 total (35 active) |
|
Queueing time: mean = 157.796 ms, max = 1921.160 s, min = -0.001 s, total = 51652.241 s |
|
Execution time: mean = 11.183 ms, total = 3660.583 s |
|
Event stats: |
|
NodeManagerService.grpc_server.ReportWorkerBacklog.HandleRequestImpl - 78537 total (0 active), Execution time: mean = 36.782 us, total = 2.889 s, Queueing time: mean = 107.699 us, max = 3.225 ms, min = 1.438 us, total = 8.458 s |
|
NodeManagerService.grpc_server.ReportWorkerBacklog - 78537 total (0 active), Execution time: mean = 525.207 us, total = 41.248 s, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
NodeManager.CheckGC - 37369 total (1 active), Execution time: mean = 3.184 us, total = 118.978 ms, Queueing time: mean = 99.695 us, max = 51.449 ms, min = 3.386 us, total = 3.726 s |
|
RaySyncer.OnDemandBroadcasting - 37369 total (1 active), Execution time: mean = 11.670 us, total = 436.105 ms, Queueing time: mean = 92.257 us, max = 51.440 ms, min = 7.347 us, total = 3.448 s |
|
ObjectManager.UpdateAvailableMemory - 37368 total (0 active), Execution time: mean = 5.998 us, total = 224.145 ms, Queueing time: mean = 104.379 us, max = 1.031 ms, min = 2.098 us, total = 3.900 s |
|
RayletWorkerPool.deadline_timer.kill_idle_workers - 18695 total (1 active), Execution time: mean = 19.301 us, total = 360.831 ms, Queueing time: mean = 76.564 us, max = 13.722 ms, min = 4.133 us, total = 1.431 s |
|
MemoryMonitor.CheckIsMemoryUsageAboveThreshold - 14932 total (1 active), Execution time: mean = 456.426 us, total = 6.815 s, Queueing time: mean = 75.037 us, max = 1.472 ms, min = -0.001 s, total = 1.120 s |
|
NodeManager.ScheduleAndDispatchTasks - 3741 total (1 active), Execution time: mean = 15.314 us, total = 57.290 ms, Queueing time: mean = 67.187 us, max = 2.582 ms, min = 6.718 us, total = 251.345 ms |
|
NodeManager.deadline_timer.flush_free_objects - 3740 total (1 active), Execution time: mean = 9.528 us, total = 35.635 ms, Queueing time: mean = 182.702 us, max = 2.380 ms, min = 160.000 ns, total = 683.306 ms |
|
NodeManager.deadline_timer.spill_objects_when_over_threshold - 3740 total (1 active), Execution time: mean = 2.970 us, total = 11.107 ms, Queueing time: mean = 186.949 us, max = 2.379 ms, min = 4.508 us, total = 699.189 ms |
|
NodeManagerService.grpc_server.GetResourceLoad.HandleRequestImpl - 3739 total (0 active), Execution time: mean = 102.185 us, total = 382.070 ms, Queueing time: mean = 112.302 us, max = 1.512 ms, min = 4.918 us, total = 419.896 ms |
|
NodeManagerService.grpc_server.GetResourceLoad - 3739 total (0 active), Execution time: mean = 625.139 us, total = 2.337 s, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ClusterResourceManager.ResetRemoteNodeView - 1247 total (1 active), Execution time: mean = 9.266 us, total = 11.555 ms, Queueing time: mean = 72.901 us, max = 363.446 us, min = 7.807 us, total = 90.907 ms |
|
NodeManager.GcsCheckAlive - 748 total (1 active), Execution time: mean = 324.276 us, total = 242.559 ms, Queueing time: mean = 623.636 us, max = 2.445 ms, min = 6.025 us, total = 466.480 ms |
|
ray::rpc::NodeInfoGcsService.grpc_client.CheckAlive.OnReplyReceived - 748 total (0 active), Execution time: mean = 55.115 us, total = 41.226 ms, Queueing time: mean = 105.471 us, max = 307.469 us, min = 11.913 us, total = 78.893 ms |
|
ray::rpc::NodeInfoGcsService.grpc_client.CheckAlive - 748 total (0 active), Execution time: mean = 1.559 ms, total = 1.166 s, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
NodeManager.deadline_timer.record_metrics - 748 total (1 active), Execution time: mean = 553.899 us, total = 414.316 ms, Queueing time: mean = 394.386 us, max = 2.257 ms, min = 8.454 us, total = 295.001 ms |
|
NodeManager.deadline_timer.debug_state_dump - 374 total (1 active, 1 running), Execution time: mean = 1.822 ms, total = 681.602 ms, Queueing time: mean = 73.015 us, max = 183.426 us, min = 11.269 us, total = 27.308 ms |
|
ClientConnection.async_read.ProcessMessageHeader - 241 total (21 active), Execution time: mean = 8.391 us, total = 2.022 ms, Queueing time: mean = 214.220 s, max = 1921.160 s, min = 23.644 us, total = 51627.016 s |
|
ClientConnection.async_read.ProcessMessage - 220 total (0 active), Execution time: mean = 355.794 us, total = 78.275 ms, Queueing time: mean = 20.354 us, max = 494.085 us, min = 2.397 us, total = 4.478 ms |
|
NodeManagerService.grpc_server.RequestWorkerLease - 83 total (0 active), Execution time: mean = 48.706 ms, total = 4.043 s, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
NodeManagerService.grpc_server.RequestWorkerLease.HandleRequestImpl - 83 total (0 active), Execution time: mean = 100.509 us, total = 8.342 ms, Queueing time: mean = 180.288 us, max = 674.029 us, min = 6.921 us, total = 14.964 ms |
|
NodeManagerService.grpc_server.ReturnWorker.HandleRequestImpl - 73 total (0 active), Execution time: mean = 105.669 us, total = 7.714 ms, Queueing time: mean = 101.687 us, max = 252.805 us, min = 19.400 us, total = 7.423 ms |
|
NodeManagerService.grpc_server.ReturnWorker - 73 total (0 active), Execution time: mean = 589.105 us, total = 43.005 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
WorkerPool.PopWorkerCallback - 73 total (0 active), Execution time: mean = 36.946 us, total = 2.697 ms, Queueing time: mean = 164.270 us, max = 539.776 us, min = 15.433 us, total = 11.992 ms |
|
- 65 total (0 active), Execution time: mean = 913.708 ns, total = 59.391 us, Queueing time: mean = 98.104 us, max = 237.802 us, min = 20.527 us, total = 6.377 ms |
|
RaySyncer.BroadcastMessage - 65 total (0 active), Execution time: mean = 214.865 us, total = 13.966 ms, Queueing time: mean = 691.308 ns, max = 1.206 us, min = 91.000 ns, total = 44.935 us |
|
NodeManager.deadline_timer.print_event_loop_stats - 63 total (1 active), Execution time: mean = 2.828 ms, total = 178.176 ms, Queueing time: mean = 70.600 us, max = 169.082 us, min = 13.745 us, total = 4.448 ms |
|
ClientConnection.async_write.DoAsyncWrites - 22 total (0 active), Execution time: mean = 1.435 us, total = 31.563 us, Queueing time: mean = 66.914 us, max = 367.875 us, min = 8.996 us, total = 1.472 ms |
|
NodeManagerService.grpc_server.GetSystemConfig.HandleRequestImpl - 21 total (0 active), Execution time: mean = 129.760 us, total = 2.725 ms, Queueing time: mean = 104.003 us, max = 159.261 us, min = 13.379 us, total = 2.184 ms |
|
NodeManagerService.grpc_server.GetSystemConfig - 21 total (0 active), Execution time: mean = 1.379 ms, total = 28.968 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ObjectManager.ObjectDeleted - 21 total (0 active), Execution time: mean = 20.184 us, total = 423.864 us, Queueing time: mean = 120.258 us, max = 235.929 us, min = 29.898 us, total = 2.525 ms |
|
ObjectManager.ObjectAdded - 21 total (0 active), Execution time: mean = 14.666 us, total = 307.984 us, Queueing time: mean = 125.073 us, max = 550.635 us, min = 8.198 us, total = 2.627 ms |
|
PeriodicalRunner.RunFnPeriodically - 13 total (0 active), Execution time: mean = 439.203 us, total = 5.710 ms, Queueing time: mean = 4.809 ms, max = 12.424 ms, min = 61.413 us, total = 62.521 ms |
|
NodeManagerService.grpc_server.CancelWorkerLease.HandleRequestImpl - 10 total (0 active), Execution time: mean = 84.638 us, total = 846.385 us, Queueing time: mean = 261.769 us, max = 482.894 us, min = 112.774 us, total = 2.618 ms |
|
NodeManagerService.grpc_server.CancelWorkerLease - 10 total (0 active), Execution time: mean = 875.644 us, total = 8.756 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::InternalPubSubGcsService.grpc_client.GcsSubscriberPoll - 8 total (1 active), Execution time: mean = 449.831 s, total = 3598.645 s, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::InternalPubSubGcsService.grpc_client.GcsSubscriberPoll.OnReplyReceived - 7 total (0 active), Execution time: mean = 422.222 us, total = 2.956 ms, Queueing time: mean = 100.169 us, max = 238.879 us, min = 29.074 us, total = 701.180 us |
|
NodeManager.GCTaskFailureReason - 5 total (1 active), Execution time: mean = 8.422 us, total = 42.108 us, Queueing time: mean = 68.676 us, max = 126.591 us, min = 59.744 us, total = 343.378 us |
|
ray::rpc::InternalPubSubGcsService.grpc_client.GcsSubscriberCommandBatch.OnReplyReceived - 2 total (0 active), Execution time: mean = 137.931 us, total = 275.863 us, Queueing time: mean = 2.023 ms, max = 4.028 ms, min = 18.196 us, total = 4.047 ms |
|
ray::rpc::InternalPubSubGcsService.grpc_client.GcsSubscriberCommandBatch - 2 total (0 active), Execution time: mean = 1.513 ms, total = 3.027 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
RaySyncerRegister - 2 total (0 active), Execution time: mean = 1.986 us, total = 3.973 us, Queueing time: mean = 180.500 ns, max = 284.000 ns, min = 77.000 ns, total = 361.000 ns |
|
ray::rpc::JobInfoGcsService.grpc_client.GetAllJobInfo - 1 total (0 active), Execution time: mean = 1.721 ms, total = 1.721 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::NodeInfoGcsService.grpc_client.GetAllNodeInfo - 1 total (0 active), Execution time: mean = 1.482 ms, total = 1.482 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::JobInfoGcsService.grpc_client.ReportJobError - 1 total (0 active), Execution time: mean = 1.897 ms, total = 1.897 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::NodeInfoGcsService.grpc_client.GetAllNodeInfo.OnReplyReceived - 1 total (0 active), Execution time: mean = 142.402 us, total = 142.402 us, Queueing time: mean = 115.097 us, max = 115.097 us, min = 115.097 us, total = 115.097 us |
|
Subscriber.HandlePublishedMessage_GCS_JOB_CHANNEL - 1 total (0 active), Execution time: mean = 69.860 us, total = 69.860 us, Queueing time: mean = 301.959 us, max = 301.959 us, min = 301.959 us, total = 301.959 us |
|
ray::rpc::InternalKVGcsService.grpc_client.GetInternalConfig - 1 total (0 active), Execution time: mean = 2.102 ms, total = 2.102 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::JobInfoGcsService.grpc_client.GetAllJobInfo.OnReplyReceived - 1 total (0 active), Execution time: mean = 20.913 us, total = 20.913 us, Queueing time: mean = 20.083 us, max = 20.083 us, min = 20.083 us, total = 20.083 us |
|
ray::rpc::NodeInfoGcsService.grpc_client.RegisterNode.OnReplyReceived - 1 total (0 active), Execution time: mean = 585.655 us, total = 585.655 us, Queueing time: mean = 25.912 us, max = 25.912 us, min = 25.912 us, total = 25.912 us |
|
ray::rpc::InternalKVGcsService.grpc_client.GetInternalConfig.OnReplyReceived - 1 total (0 active), Execution time: mean = 22.315 ms, total = 22.315 ms, Queueing time: mean = 78.086 us, max = 78.086 us, min = 78.086 us, total = 78.086 us |
|
ray::rpc::JobInfoGcsService.grpc_client.ReportJobError.OnReplyReceived - 1 total (0 active), Execution time: mean = 71.261 us, total = 71.261 us, Queueing time: mean = 144.822 us, max = 144.822 us, min = 144.822 us, total = 144.822 us |
|
ray::rpc::JobInfoGcsService.grpc_client.AddJob - 1 total (0 active), Execution time: mean = 1.647 ms, total = 1.647 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
ray::rpc::JobInfoGcsService.grpc_client.AddJob.OnReplyReceived - 1 total (0 active), Execution time: mean = 53.657 us, total = 53.657 us, Queueing time: mean = 375.226 us, max = 375.226 us, min = 375.226 us, total = 375.226 us |
|
ray::rpc::NodeInfoGcsService.grpc_client.RegisterNode - 1 total (0 active), Execution time: mean = 2.339 ms, total = 2.339 ms, Queueing time: mean = 0.000 s, max = -0.000 s, min = 9223372036.855 s, total = 0.000 s |
|
DebugString() time ms: 1 |