getRepairFunc returns the repair function based on the component.
(hco *options.HealthCheckerOptions)
| 55 | |
| 56 | // getRepairFunc returns the repair function based on the component. |
| 57 | func getRepairFunc(hco *options.HealthCheckerOptions) func() { |
| 58 | // Use `systemctl kill` instead of `systemctl restart` for the repair function. |
| 59 | // We start to rely on the kernel message difference for the two commands to |
| 60 | // indicate if the component restart is due to an administrative plan (restart) |
| 61 | // or a system issue that needs repair (kill). |
| 62 | // See https://github.com/kubernetes/node-problem-detector/issues/847. |
| 63 | switch hco.Component { |
| 64 | case types.DockerComponent: |
| 65 | // Use "docker ps" for docker health check. Not using crictl for docker to remove |
| 66 | // dependency on the kubelet. |
| 67 | return func() { |
| 68 | if _, err := execCommand(types.CmdTimeout, "pkill", "-SIGUSR1", "dockerd"); err != nil { |
| 69 | klog.Errorf("Failed to send SIGUSR1 to dockerd: %v", err) |
| 70 | } |
| 71 | if _, err := execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service); err != nil { |
| 72 | klog.Errorf("Failed to kill service %s: %v", hco.Service, err) |
| 73 | } |
| 74 | } |
| 75 | default: |
| 76 | // Just kill the service for all other components |
| 77 | return func() { |
| 78 | if _, err := execCommand(types.CmdTimeout, "systemctl", "kill", "--kill-who=main", hco.Service); err != nil { |
| 79 | klog.Errorf("Failed to kill service %s: %v", hco.Service, err) |
| 80 | } |
| 81 | } |
| 82 | } |
| 83 | } |
| 84 | |
| 85 | // checkForPattern returns (true, nil) if logPattern occurs fewer than logCountThreshold times since the last |
| 86 | // service restart, and (false, nil) otherwise. |
no test coverage detected