fix(v2.4.3): flock serialization for bot_action_dispatch

Iteration 3 discovered a race condition: parallel CLI invocations of
change-lite-domain occasionally produced 'no secret in config' because
one process read config.json while another was mid-rewrite (jq -> tmp -> mv).
asyncio.Lock in bot.py only protects bot->bot races; CLI-level races and
bot+CLI mixed races were still possible.

- install.sh: bot_action_dispatch wraps dispatch in flock(1) on
  /var/lock/gotelegram-bot-action.lock, 30s timeout, EX_TEMPFAIL on timeout
- common.sh: flock added to critical deps; apt_pkg_for_cmd/dnf_pkg_for_cmd
  map flock -> util-linux
- common.sh: check_deps_present includes flock
- version bumped to 2.4.3 (common.sh + bot.py)
This commit is contained in:
anten-ka
2026-04-10 13:35:22 +03:00
parent 724eeb92d9
commit e9af6e969f
3 changed files with 48 additions and 6 deletions

View File

@@ -1323,8 +1323,45 @@ bot_action_change_lite_domain() {
return 0
}
# Main dispatcher — called from main() when --action=X is present
# Main dispatcher — called from main() when --action=X is present.
#
# Serializes bot actions with flock(1) so concurrent invocations (several bot
# users, or bot + manual CLI) cannot interleave. Without the lock, two
# parallel `change-lite-domain` calls raced on the jq-rewrite of config.json
# and one process would see a truncated file ("no secret in config").
#
# Globals (optional overrides, read only):
#   BOT_ACTION_LOCK_FILE     lock file path
#                            (default: /var/lock/gotelegram-bot-action.lock)
#   BOT_ACTION_LOCK_TIMEOUT  whole seconds to wait for the lock (default: 30;
#                            anything that is not a plain integer falls back
#                            to the default)
# Arguments: "$@" is handed unchanged to _bot_action_dispatch_locked; only
#            --json is inspected here, to pick the error format on timeout.
# Outputs:   on lock timeout, a JSON error via bot_emit_json when --json was
#            given, otherwise a one-line message on stderr.
# Returns:   the status of _bot_action_dispatch_locked, or 75 (EX_TEMPFAIL)
#            when the lock could not be acquired in time.
bot_action_dispatch() {
    local lock_file="${BOT_ACTION_LOCK_FILE:-/var/lock/gotelegram-bot-action.lock}"
    local lock_timeout="${BOT_ACTION_LOCK_TIMEOUT:-30}"
    local lock_dir

    # A malformed timeout would make flock itself fail, and that failure would
    # then be misreported as "lock timeout" — fall back to the default instead.
    case "$lock_timeout" in
        ''|*[!0-9]*) lock_timeout=30 ;;
    esac

    if ! command -v flock >/dev/null 2>&1; then
        # No flock installed — run unlocked with a warning. ensure_deps/check_deps
        # normally ensures util-linux is present, so this branch is defensive.
        log_warning "flock not available — bot actions not serialized"
        _bot_action_dispatch_locked "$@"
        return $?
    fi

    # Make sure the lock directory exists (/var/lock does on Debian/Ubuntu;
    # be defensive for minimal images). Failure is tolerated here because the
    # open probe below decides whether locking is actually possible.
    case "$lock_file" in
        */*) lock_dir="${lock_file%/*}" ;;
        *)   lock_dir="." ;;
    esac
    [ -n "$lock_dir" ] || lock_dir="/"
    [ -d "$lock_dir" ] || mkdir -p -- "$lock_dir" 2>/dev/null || true

    # Probe in a throwaway subshell: if the lock file cannot be opened, degrade
    # to unlocked execution (same policy as the missing-flock case) rather than
    # letting the redirection error abort the action with no usable output.
    if ! ( exec 9>>"$lock_file" ) 2>/dev/null; then
        log_warning "cannot open lock file $lock_file — bot actions not serialized"
        _bot_action_dispatch_locked "$@"
        return $?
    fi

    # Bot actions are fast (<5s typical), so the default 30s wait is plenty for
    # legitimate serialization but short enough to surface a stuck process.
    # The lock lives on fd 9 of the subshell and is released when it exits.
    # NOTE(review): any background process started by an action inherits fd 9
    # and would keep the lock held after this function returns — confirm that
    # no action leaves children running.
    (
        if ! flock -w "$lock_timeout" 9; then
            # Timed out: report in the format the caller asked for.
            local json_out=0 a
            for a in "$@"; do
                case "$a" in
                    --json) json_out=1 ;;
                esac
            done
            if [ "$json_out" = "1" ]; then
                bot_emit_json "error" "another action in progress (lock timeout)" "code=lock_timeout"
            else
                # Human callers previously got a bare exit code with no explanation.
                printf '%s\n' "another action in progress (lock timeout)" >&2
            fi
            exit 75  # EX_TEMPFAIL
        fi
        _bot_action_dispatch_locked "$@"
    ) 9>>"$lock_file"  # >> so the lock file is opened without being truncated
}
_bot_action_dispatch_locked() {
local action="" tpl_id="" domain="" json_out=0 arg
for arg in "$@"; do
case "$arg" in