Add read seek optimization for log monitoring
Signed-off-by: Felipe Cardoso <felipe.cardoso@hotmail.it>
This commit is contained in:
@@ -21,11 +21,13 @@ class TrainingMonitor:
|
|||||||
self.sftp_client = None
|
self.sftp_client = None
|
||||||
self._running = False
|
self._running = False
|
||||||
self._monitor_task = None
|
self._monitor_task = None
|
||||||
self.recent_logs: List[str] = [] # Store recent log lines
|
self.recent_logs: List[str] = []
|
||||||
self.max_log_lines: int = 100 # Keep last 100 lines
|
self.max_log_lines: int = 500
|
||||||
self.current_status: Optional[TrainingStatus] = None
|
self.current_status: Optional[TrainingStatus] = None
|
||||||
self.remote_path = settings.TRAINING_LOG_REMOTE_PATH if hasattr(settings, 'TRAINING_LOG_REMOTE_PATH') else None
|
self.remote_path = settings.TRAINING_LOG_REMOTE_PATH if hasattr(settings, 'TRAINING_LOG_REMOTE_PATH') else None
|
||||||
self.local_path = settings.TRAINING_LOG_LOCAL_PATH if hasattr(settings, 'TRAINING_LOG_LOCAL_PATH') else None
|
self.local_path = settings.TRAINING_LOG_LOCAL_PATH if hasattr(settings, 'TRAINING_LOG_LOCAL_PATH') else None
|
||||||
|
self._file_handle = None
|
||||||
|
self._last_position = 0
|
||||||
|
|
||||||
def _parse_tqdm_line(self, line: str) -> Optional[TrainingStatus]:
|
def _parse_tqdm_line(self, line: str) -> Optional[TrainingStatus]:
|
||||||
"""Parse tqdm output line into TrainingStatus"""
|
"""Parse tqdm output line into TrainingStatus"""
|
||||||
@@ -87,30 +89,91 @@ class TrainingMonitor:
|
|||||||
logger.error(f"Error reading local log: {str(e)}")
|
logger.error(f"Error reading local log: {str(e)}")
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
|
async def _open_log_file(self):
    """Open the monitored log and keep the handle on ``self._file_handle``.

    When ``self.remote_path`` is set, the file is opened through the SFTP
    client (connecting first if no client exists yet); otherwise the local
    file at ``self.local_path`` is opened with aiofiles. Both are opened in
    binary read mode.
    """
    if not self.remote_path:
        # Local log: aiofiles returns an async file object.
        self._file_handle = await aiofiles.open(self.local_path, 'rb')
        return

    # Remote log: make sure an SFTP session exists before opening.
    if not self.sftp_client:
        await self._connect_sftp()
    self._file_handle = self.sftp_client.open(self.remote_path, 'rb')
|
||||||
|
|
||||||
|
async def _read_new_content(self) -> str:
    """Return the log text appended since the previous call.

    Opens the log on first use, detects truncation/rotation (current size
    smaller than the stored offset) and restarts from the beginning in that
    case, then reads from ``self._last_position`` to the end of the file.

    Returns:
        The newly appended text, or ``""`` if reading failed (the error is
        logged and the file handle is reopened for the next attempt).
    """
    # Local import so this method stays self-contained.
    import codecs

    if not self._file_handle:
        await self._open_log_file()

    try:
        handle = self._file_handle
        # The remote handle comes from the SFTP client and is used with
        # blocking calls; the local one comes from aiofiles, whose file
        # methods are coroutines and must be awaited.
        # NOTE(review): assumes the SFTP file object is synchronous
        # (paramiko-style) -- confirm against _connect_sftp.
        is_remote = bool(self.remote_path)

        # Current size is only needed to detect truncation/rotation.
        if is_remote:
            handle.seek(0, 2)  # whence=2: seek to end of file
            file_size = handle.tell()
        else:
            file_size = os.path.getsize(self.local_path)

        if file_size < self._last_position:
            # File has been truncated/rotated
            logger.info("Log file has been truncated, reading from start")
            self._last_position = 0

        # Read everything after the last consumed offset.
        if is_remote:
            handle.seek(self._last_position)
            raw = handle.read()
        else:
            await handle.seek(self._last_position)
            raw = await handle.read()

        if not isinstance(raw, bytes):
            # Handle yielded text already; byte offsets cannot be derived
            # from it, so fall back to the size sampled above.
            self._last_position = file_size
            return raw

        # Decode incrementally: invalid bytes become U+FFFD instead of
        # raising, and a multi-byte character cut off at the end of this
        # read is held back (not consumed) so the next poll re-reads it
        # once the writer has finished it.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
        text = decoder.decode(raw)
        held_back = len(decoder.getstate()[0])

        # Advance by what was actually consumed, not by the size sampled
        # before the read: the file may have grown in between, and using
        # the stale size would make the next poll return duplicate data.
        self._last_position += len(raw) - held_back

        return text

    except Exception as e:
        logger.error(f"Error reading log: {str(e)}")
        # Try to reopen the file on error
        await self._reopen_log_file()
        return ""
|
||||||
|
|
||||||
|
async def _reopen_log_file(self):
    """Close the current log handle (best effort) and open a fresh one.

    Used to recover after a read error. A failure while closing the old
    handle is logged and ignored; a failure while opening the new one
    propagates to the caller.
    """
    # Local import so this method stays self-contained.
    import inspect

    # Drop the reference first so a failed close can never leave a stale
    # handle behind.
    handle, self._file_handle = self._file_handle, None

    if handle is not None:
        try:
            result = handle.close()
            # aiofiles handles return a coroutine from close(); without
            # awaiting it the underlying file would never be closed.
            # Synchronous handles (the SFTP branch) return a plain value.
            if inspect.isawaitable(result):
                await result
        except Exception as e:
            # Closing is best effort: the handle is being discarded anyway.
            logger.debug(f"Ignoring error while closing log file: {str(e)}")

    await self._open_log_file()
|
||||||
|
|
||||||
|
|
||||||
async def _monitor_log(self):
|
async def _monitor_log(self):
|
||||||
"""Monitor log file for updates"""
|
"""Monitor log file for updates"""
|
||||||
while self._running:
|
while self._running:
|
||||||
try:
|
try:
|
||||||
content = (await self._read_remote_log() if self.remote_path
|
new_content = await self._read_new_content()
|
||||||
else await self._read_local_log())
|
if new_content:
|
||||||
|
# Process new lines
|
||||||
|
new_lines = new_content.splitlines()
|
||||||
|
if new_lines:
|
||||||
|
# Update recent logs
|
||||||
|
self.recent_logs.extend(new_lines)
|
||||||
|
self.recent_logs = self.recent_logs[-self.max_log_lines:]
|
||||||
|
|
||||||
# Get last line containing progress info
|
# Update status from last progress line
|
||||||
lines = content.splitlines()
|
for line in reversed(new_lines):
|
||||||
self.recent_logs = lines[-self.max_log_lines:] if lines else []
|
if '|' in line:
|
||||||
for line in reversed(lines):
|
status = self._parse_tqdm_line(line)
|
||||||
if '|' in line: # Basic check for tqdm output
|
if status:
|
||||||
status = self._parse_tqdm_line(line)
|
self.current_status = status
|
||||||
if status:
|
break
|
||||||
self.current_status = status
|
|
||||||
break
|
|
||||||
|
|
||||||
await asyncio.sleep(1) # Check every second
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Monitor error: {str(e)}")
|
logger.error(f"Monitor error: {str(e)}")
|
||||||
await asyncio.sleep(5) # Wait before retry
|
await asyncio.sleep(5)
|
||||||
|
|
||||||
async def get_log(self, lines: int = 50) -> List[str]:
|
async def get_log(self, lines: int = 50) -> List[str]:
|
||||||
"""Get recent log entries"""
|
"""Get recent log entries"""
|
||||||
@@ -134,6 +197,15 @@ class TrainingMonitor:
|
|||||||
await self._monitor_task
|
await self._monitor_task
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Close file handle
|
||||||
|
if self._file_handle:
|
||||||
|
try:
|
||||||
|
self._file_handle.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self._file_handle = None
|
||||||
|
|
||||||
logger.info("Training monitor stopped")
|
logger.info("Training monitor stopped")
|
||||||
|
|
||||||
async def get_status(self) -> Optional[TrainingStatus]:
|
async def get_status(self) -> Optional[TrainingStatus]:
|
||||||
|
|||||||
Reference in New Issue
Block a user