Catch and format exception information, send alert message to Feishu.
(self, alert_address: str = None, excp_info: str = None)
| 149 | self.last_step_loss = cur_step_loss |
| 150 | |
def monitor_exception(self, alert_address: str = None, excp_info: str = None):
    """Format exception information and send it as an alert message to Feishu.

    Only the last 10 newline-separated lines of ``excp_info`` are included in
    the alert. The message also carries the local hostname and the global
    rank id of the reporting process.

    Args:
        alert_address: Destination forwarded to ``send_alert_message`` as
            its ``address`` argument.
        excp_info: Exception text to report (presumably a formatted
            traceback; confirm against callers). ``None`` is treated as an
            empty string.
    """
    # The parameter defaults to None; fall back to "" so a missing trace
    # cannot raise AttributeError while an exception is being reported.
    trace_tail = (excp_info or "").split("\n")[-10:]
    # Every kept line is prefixed with a newline, so the trace begins on its
    # own line right after the "rank id ...:" header.
    format_trace = "".join("\n" + line for line in trace_tail)
    send_alert_message(
        address=alert_address,
        message=f"Catch Exception from {socket.gethostname()} with rank id {gpc.get_global_rank()}:{format_trace}",
    )
| 161 | |
| 162 | def handle_sigterm(self, alert_address: str = None): |
| 163 | """Catch SIGTERM signal, and send alert message to Feishu.""" |
no test coverage detected