Index: /branches/amp_4_0/platform/config/telegraf/telegraf.conf
===================================================================
--- /branches/amp_4_0/platform/config/telegraf/telegraf.conf	(revision 2954)
+++ /branches/amp_4_0/platform/config/telegraf/telegraf.conf	(working copy)
@@ -165,14 +165,38 @@
 '''
 
 [[processors.starlark]]
-  order = 7
-  # GLOBAL HOST REMOVAL: Removes the default Telegraf 'host' tag/field from ALL metrics
-  # This ensures the default Telegraf host field, which PostgreSQL does not have a column for, is removed universally.
+  order = 10
+  # Normalize host tags per table schema: system/docker metrics keep 'host'; SNMP device metrics (an_/ag_/asf_/apv_) use 'agent_host'
   source = '''
 def apply(metric):
-    if "host" in metric.fields:
-        metric.fields.pop("host")
-    if "host" in metric.tags:
-        metric.tags.pop("host")
+    # Tables that have a 'host' column in the schema
+    system_metrics = ["cpu", "mem", "disk", "diskio", "net", "system", "docker", 
+                      "docker_container_cpu", "docker_container_mem", "docker_container_net", "docker_container_blkio"]
+    
+    if metric.name in system_metrics:
+        # Keep 'host', but remove 'source'
+        if "source" in metric.tags: metric.tags.pop("source")
+        return metric
+
+    # Tables that use 'agent_host' instead of 'host' (SNMP Device Metrics)
+    device_prefixes = ["an_", "ag_", "asf_", "apv_"]
+    is_device = False
+    for p in device_prefixes:
+        if metric.name.startswith(p):
+            is_device = True
+            break
+            
+    if is_device:
+        # Map source/host to agent_host
+        addr = metric.tags.get("source") or metric.tags.get("agent_host") or metric.tags.get("host")
+        if addr:
+            metric.tags["agent_host"] = addr
+            
+        # Remove columns that don't exist in SNMP tables
+        if "host" in metric.tags: metric.tags.pop("host")
+        if "host" in metric.fields: metric.fields.pop("host")
+        if "source" in metric.tags: metric.tags.pop("source")
+        if "source" in metric.fields: metric.fields.pop("source")
+        
     return metric
 '''
Index: /branches/amp_4_0/platform/tools/container/README.md
===================================================================
--- /branches/amp_4_0/platform/tools/container/README.md	(revision 2954)
+++ /branches/amp_4_0/platform/tools/container/README.md	(working copy)
@@ -3,6 +3,8 @@
 ## 1. Introduction
 
 This directory contains the configurations and scripts to deploy the AMP platform on **Docker Swarm**.
+
+For operations and incident handling, see `TROUBLESHOOTING.md`.
 
 ### Prerequisites
 
Index: /branches/amp_4_0/platform/tools/container/TROUBLESHOOTING.md
===================================================================
--- /branches/amp_4_0/platform/tools/container/TROUBLESHOOTING.md	(nonexistent)
+++ /branches/amp_4_0/platform/tools/container/TROUBLESHOOTING.md	(working copy)
@@ -0,0 +1,318 @@
+# AMP Docker Swarm Debug & Troubleshooting Runbook
+
+This runbook is for the Docker Swarm deployment under `platform/tools/container`.
+
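+Set dynamic paths once per shell session. Run this from `platform/tools/container`: in an interactive shell `BASH_SOURCE` is empty, so `SCRIPT_DIR` resolves to the current directory: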
+
+```bash
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+cd "$SCRIPT_DIR"
+```
+
+## 1. Quick Triage (First 5 Minutes)
+
+Run from a manager node:
+
+```bash
+cd "$SCRIPT_DIR"
+./manage_amp.sh status
+docker service ls
+docker stack ps amp --no-trunc
+docker node ls
+```
+
+If Swarm itself is unstable:
+
+```bash
+docker info | grep -i swarm
+systemctl status docker
+journalctl -u docker -n 200 --no-pager
+```
+
+## 2. Service and Task Status
+
+Stack level:
+
+```bash
+docker stack services amp
+docker stack ps amp --no-trunc
+```
+
+Single service:
+
+```bash
+docker service ps amp_opensearch --no-trunc
+docker service ps amp_timescaledb --no-trunc
+docker service inspect amp_opensearch --pretty
+```
+
+Find failing tasks quickly:
+
+```bash
+docker stack ps amp --no-trunc | egrep "Rejected|Failed|Shutdown|Complete"
+```
+
+## 3. Logs: Where and How to Check
+
+### 3.1 Swarm service logs
+
+```bash
+docker service logs -f --tail 200 amp_opensearch
+docker service logs -f --tail 200 amp_timescaledb
+docker service logs -f --tail 200 amp_nginx
+docker service logs -f --tail 200 amp_logstash
+docker service logs -f --tail 200 amp_amp-core
+```
+
+### 3.2 Task/container logs on specific nodes
+
+```bash
+docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" | grep amp_
+docker logs -f <container_id>
+```
+
+### 3.3 Host-mounted logs
+
+`manage_amp.sh` uses `AMP_LOG_ROOT` (default: `/var/log/amp`).
+
+```bash
+ls -lah /var/log/amp
+ls -lah /var/log/amp/nginx
+ls -lah /var/log/amp/opensearch
+ls -lah /var/log/amp/amp-core
+tail -n 200 /var/log/amp/nginx/error.log
+```
+
+### 3.4 Deployment script log
+
+```bash
+tail -n 300 "$SCRIPT_DIR/manage_amp.log"
+```
+
+## 4. Restart Operations
+
+Recommended restart wrappers:
+
+```bash
+# Restart all services in stack
+./manage_amp.sh restart
+
+# Restart one service (short name or full swarm service name)
+./manage_amp.sh restart opensearch
+./manage_amp.sh restart amp_opensearch
+```
+
+Equivalent direct Docker command:
+
+```bash
+docker service update --force amp_opensearch
+```
+
+## 5. Build and Update Images
+
+### 5.1 Full rebuild + rollout (recommended)
+
+```bash
+cd "$SCRIPT_DIR"
+./manage_amp.sh build
+./manage_amp.sh deploy --auto
+```
+
+Use this for standard update flows. `--auto` regenerates dynamic cluster values before deployment.
+
+### 5.2 Build only one image on local build machine
+
+Set registry first:
+
+```bash
+export REGISTRY="<MANAGER_IP>:5000"
+```
+
+#### `timescaledb`
+
+```bash
+docker build -t "$REGISTRY/amp/timescaledb:${TAG_TIMESCALEDB:-latest-pg14}" \
+  -f services/timescaledb/Dockerfile.patroni services/timescaledb
+docker push "$REGISTRY/amp/timescaledb:${TAG_TIMESCALEDB:-latest-pg14}"
+```
+
+#### `nginx` (GUI image)
+
+Build GUI first, then image:
+
+```bash
+./manage_amp.sh build   # easiest path; includes GUI build logic
+```
+
+If you must do a manual nginx-only build, ensure `services/nginx/gui/` already contains built GUI artifacts before `docker build`.
+
+#### `amp-core`
+
+Requires binaries (`backend`, `webui_agent`) to exist:
+
+```bash
+# from repo root
+cd "$REPO_ROOT"
+make amp
+
+# from container dir
+cd "$SCRIPT_DIR"
+docker build -t "$REGISTRY/amp/amp-core:${TAG_AMP_CORE:-1.0.0}" services/amp-core
+docker push "$REGISTRY/amp/amp-core:${TAG_AMP_CORE:-1.0.0}"
+```
+
+### 5.3 Roll out updated image to Swarm
+
+Option A (preferred): redeploy stack with tags in `.env`.
+
+```bash
+./manage_amp.sh deploy --auto
+```
+
+Option B (single-service fast rollout):
+
+```bash
+docker service update --image "$REGISTRY/amp/opensearch:${TAG_OPENSEARCH}" amp_opensearch
+```
+
+Then verify:
+
+```bash
+docker service ps amp_opensearch --no-trunc
+docker service inspect amp_opensearch --pretty
+```
+
+## 6. Rollback
+
+If a service update breaks:
+
+```bash
+docker service rollback amp_opensearch
+docker service ps amp_opensearch --no-trunc
+```
+
+For full stack rollback, restore prior image tags in `.env` and redeploy:
+
+```bash
+./manage_amp.sh deploy --auto
+```
+
+## 7. Node and Cluster Troubleshooting
+
+Check node availability and labels:
+
+```bash
+docker node ls
+docker node inspect <node_id> --pretty
+docker node inspect <node_id> --format '{{json .Spec.Labels}}'
+```
+
+Storage label required for stateful placement:
+
+```bash
+docker node update --label-add type=storage <node_id>
+```
+
+Drain a bad node, then return it to service once it is healthy:
+
+```bash
+docker node update --availability drain <node_id>
+docker node update --availability active <node_id>
+```
+
+## 8. Network and Port Checks
+
+Swarm required ports between nodes:
+- `2377/tcp`
+- `7946/tcp`
+- `7946/udp`
+- `4789/udp`
+
+App ports commonly used:
+- `80`, `443`, `514`, `5000`, `5432`, `9200`, `9300`, `5601`
+
+Validate listeners:
+
+```bash
+ss -tulpen | egrep ":80|:443|:5000|:5432|:9200|:9300|:5601|:514"
+```
+
+Kernel prerequisite for OpenSearch:
+
+```bash
+sysctl vm.max_map_count
+sysctl -w vm.max_map_count=262144
+```
+
+## 9. Secrets, Configs, and Certificates
+
+Check secrets/configs:
+
+```bash
+docker secret ls
+docker config ls | grep cert_
+```
+
+Initialize/rotate secrets:
+
+```bash
+./manage_amp.sh init-secrets
+./manage_amp.sh rotate-secrets
+```
+
+Recreate certs and security config:
+
+```bash
+./manage_amp.sh setup
+./manage_amp.sh security_init
+./manage_amp.sh configurator
+```
+
+One-shot post deployment:
+
+```bash
+./manage_amp.sh post_deploy
+```
+
+## 10. Common Failure Patterns
+
+- OpenSearch exits immediately:
+  - Check `vm.max_map_count=262144` on every node.
+  - Check `docker service logs amp_opensearch`.
+
+- Workers cannot pull images:
+  - Configure worker insecure registry:
+  - `./manage_amp.sh config_registry <MANAGER_IP>`
+  - Validate registry: `curl http://<MANAGER_IP>:5000/v2/_catalog`
+
+- Service stuck in `Pending`:
+  - Check node constraints/labels in service spec.
+  - Check node status (`Drain`, `Down`, resource pressure).
+
+- Cert-related TLS failures:
+  - Re-run `./manage_amp.sh setup`
+  - Verify Docker configs `cert_*` exist.
+
+- UI reachable but backend/API unhealthy:
+  - Check `amp_amp-core` logs and `/var/log/amp/amp-core`.
+  - Confirm DB access through HAProxy (`:5432`) and secrets are correct.
+
+## 11. Useful One-Liners
+
+```bash
+# live watch service replica state
+watch -n 2 'docker service ls'
+
+# show all AMP task placements
+docker stack ps amp --format 'table {{.Name}}\t{{.Node}}\t{{.CurrentState}}\t{{.Error}}'
+
+# list services whose running replica count differs from the desired count (candidates for restart)
+docker service ls --format '{{.Name}} {{.Replicas}}' | awk '{split($2,a,"/"); if (a[1]!=a[2]) print $1, $2}'
+```
+
+## 12. Operational Notes
+
+- Prefer `./manage_amp.sh deploy --auto` after changes; it refreshes generated config and stack artifacts.
+- Do not edit generated values in `stack.yml` manually if you are using auto-config flow.
+- For planned upgrades, change explicit `TAG_*` values in `.env`, then run build/deploy in sequence.
Index: /branches/amp_4_0/platform/tools/container/manage_amp.sh
===================================================================
--- /branches/amp_4_0/platform/tools/container/manage_amp.sh	(revision 2954)
+++ /branches/amp_4_0/platform/tools/container/manage_amp.sh	(working copy)
@@ -438,16 +438,13 @@
     echo "--- Initializing Telegraf Config Directory ---"
     
     TELEGRAF_MAIN_CONF="/opt/amp/telegraf.conf"
-    if [ ! -f "$TELEGRAF_MAIN_CONF" ]; then
-        echo "Creating default $TELEGRAF_MAIN_CONF"
-        if [ -f "$SERVICES_DIR/telegraf/telegraf.conf" ]; then
-            sudo cp "$SERVICES_DIR/telegraf/telegraf.conf" "$TELEGRAF_MAIN_CONF" 2>/dev/null || true
-            sudo chmod 644 "$TELEGRAF_MAIN_CONF" 2>/dev/null || true
-        else
-            echo "⚠️  Source config not found: $SERVICES_DIR/telegraf/telegraf.conf"
-        fi
+    echo "Updating $TELEGRAF_MAIN_CONF with latest configuration..."
+    if [ -f "$SERVICES_DIR/telegraf/telegraf.conf" ]; then
+        # Best-effort copy (never aborts the script), but only report success when cp and chmod both succeeded.
+        sudo cp -f "$SERVICES_DIR/telegraf/telegraf.conf" "$TELEGRAF_MAIN_CONF" 2>/dev/null && sudo chmod 644 "$TELEGRAF_MAIN_CONF" 2>/dev/null \
+            && echo "✅ Telegraf main config updated successfully." || echo "⚠️  Failed to update $TELEGRAF_MAIN_CONF (copy or chmod failed)"
     else
-        echo "✅ Telegraf main config already exists: $TELEGRAF_MAIN_CONF"
+        echo "⚠️  Source config not found: $SERVICES_DIR/telegraf/telegraf.conf"
     fi
 
     TELEGRAF_CONFIG_DIR="/opt/amp/telegraf.d"
@@ -455,19 +452,17 @@
         echo "Creating directory: $TELEGRAF_CONFIG_DIR"
         sudo mkdir -p "$TELEGRAF_CONFIG_DIR"
     else
-        echo "✅ Telegraf config directory already exists: $TELEGRAF_CONFIG_DIR"
+        echo "✅ Telegraf config directory exists: $TELEGRAF_CONFIG_DIR"
     fi
     
-    # Copy default SNMP configs from services directory (only if target files don't exist)
+    # Copy default SNMP configs from services directory (always overwrite to ensure latest)
     if [ -d "$SERVICES_DIR/telegraf/telegraf.d" ]; then
-        echo "Ensuring default Telegraf configs exist..."
+        echo "Syncing default Telegraf configs..."
         for conf in "$SERVICES_DIR/telegraf/telegraf.d"/*.toml; do
             if [ -f "$conf" ]; then
                 filename=$(basename "$conf")
-                if [ ! -f "$TELEGRAF_CONFIG_DIR/$filename" ]; then
-                    sudo cp "$conf" "$TELEGRAF_CONFIG_DIR/" 2>/dev/null || true
-                    sudo chmod 644 "$TELEGRAF_CONFIG_DIR/$filename" 2>/dev/null || true
-                fi
+                sudo cp -f "$conf" "$TELEGRAF_CONFIG_DIR/" 2>/dev/null && sudo chmod 644 "$TELEGRAF_CONFIG_DIR/$filename" 2>/dev/null \
+                    || echo "⚠️  Failed to sync $filename to $TELEGRAF_CONFIG_DIR"  # best-effort: warn and continue with the next file
             fi
         done
         echo "✅ Default configs synced to $TELEGRAF_CONFIG_DIR"
Index: /branches/amp_4_0/platform/tools/container/services/setup/install_curl.sh
===================================================================
--- /branches/amp_4_0/platform/tools/container/services/setup/install_curl.sh	(revision 2954)
+++ /branches/amp_4_0/platform/tools/container/services/setup/install_curl.sh	(working copy)
@@ -5,11 +5,11 @@
 
 set -e
 
-echo "--- Installing curl ---"
+echo "--- Installing curl, snmp, and network tools ---"
 
 # Check if already installed (offline mode support)
-if command -v curl &>/dev/null && command -v jq &>/dev/null; then
-    echo "✅ curl and jq already installed (skipping dnf)"
+if command -v curl &>/dev/null && command -v jq &>/dev/null && command -v snmpwalk &>/dev/null && command -v tcpdump &>/dev/null; then
+    echo "✅ curl, jq, snmpwalk, and tcpdump already installed (skipping dnf)"
     curl --version | head -1
     exit 0
 fi
@@ -20,13 +20,13 @@
         echo "Detected RHEL-based distribution: $ID"
         # --allowerasing is key for Rocky Linux 9 to replace curl-minimal
         dnf install -y epel-release 2>/dev/null || echo "epel-release not available (offline?)"
-        dnf install -y --allowerasing curl jq bind-utils iputils || {
+        dnf install -y --allowerasing curl jq bind-utils iputils net-snmp-utils tcpdump || {
             echo "❌ Package installation failed. In offline mode, ensure packages are pre-installed."
             exit 1
         }
     elif [[ "$ID" == "debian" || "$ID" == "ubuntu" ]]; then
         echo "Detected Debian-based distribution: $ID"
-        apt-get update && apt-get install -y curl jq
+        apt-get update && apt-get install -y curl jq dnsutils iputils-ping snmp tcpdump
     else
         echo "⚠️  Unsupported distribution: $ID"
         exit 1
@@ -36,5 +36,5 @@
     exit 1
 fi
 
-echo "✅ curl installation complete."
+echo "✅ Package installation complete."
 curl --version | head -1
Index: /branches/amp_4_0/platform/tools/container/services/telegraf/telegraf.conf
===================================================================
--- /branches/amp_4_0/platform/tools/container/services/telegraf/telegraf.conf	(revision 2954)
+++ /branches/amp_4_0/platform/tools/container/services/telegraf/telegraf.conf	(working copy)
@@ -45,27 +45,27 @@
 # Using Docker input for container stats (Metrics -> TimescaleDB)
 [[inputs.docker]]
   endpoint = "unix:///var/run/docker.sock"
-  # docker_api_version = ""  # Unsupported in this version
   gather_services = false
   timeout = "5s"
 
   docker_label_include = []
   tag_env = ["JAVA_HOME", "HEAP_SIZE"]
 
-# Using Docker logs (Logs -> Logstash)
-[[inputs.docker_log]]
-  endpoint = "unix:///var/run/docker.sock"
-  # docker_api_version = ""  # Unsupported in this version
-  container_name_include = [] # All containers
-  timeout = "5s"
+# Docker logs (Logs -> Logstash) are currently disabled.
+# The inputs.docker_log plugin in this version doesn't negotiate API 1.44 properly.
+# [[inputs.docker_log]]
+#   endpoint = "unix:///var/run/docker.sock"
+#   container_name_include = [] # All containers
+#   timeout = "5s"
 
-# === Processors (Ported from configure_telegraf_timescale.sh) ===
+# === Processors ===
 
 [[processors.regex]]
   order = 1
   namepass = ["asf_http_service"]
   [[processors.regex.fields]]
     key = ".*"
+    # TOML unescapes "\\\\x00" to the regex \\x00, i.e. a literal backslash followed by "x00" (the escaped text, not a NUL byte)
     pattern = "\\\\x00"
     replacement = ""
 
@@ -77,26 +77,56 @@
     for key, value in metric.fields.items():
         if type(value) == "string" and "\x00" in value:
             metric.fields[key] = value.replace("\x00", "")
-    if "http_service_index" not in metric.fields or metric.fields["http_service_index"] == None:
+    if "http_service_index" not in metric.fields or metric.fields[ "http_service_index"] == None:
         metric.fields["http_service_index"] = 0
     return metric
 '''
 
 [[processors.starlark]]
   order = 3
-  namepass = ["asf_http_service"]
+  # Handle host/source mapping for specific tables that lack these columns
   source = '''
 def apply(metric):
-    if "host" in metric.fields:
-        metric.fields.pop("host")
-    if "host" in metric.tags:
-        metric.tags.pop("host")
-    if "http_service_anomaly_contentlength" in metric.fields:
-        metric.fields.pop("http_service_anomaly_contentlength")
+    # Tables that have a 'host' column in the schema
+    system_metrics = ["cpu", "mem", "disk", "diskio", "net", "system", "docker", 
+                      "docker_container_cpu", "docker_container_mem", "docker_container_net", "docker_container_blkio"]
+    
+    if metric.name in system_metrics:
+        # These match create_system_metrics_tables.sql
+        if "source" in metric.tags: metric.tags.pop("source")
+        return metric
+
+    # Tables that use 'agent_host' instead of 'host' (SNMP Device Metrics)
+    # These match telegraf_snmp_timescale.sql
+    device_prefixes = ["an_", "ag_", "asf_", "apv_"]
+    is_device = False
+    for p in device_prefixes:
+        if metric.name.startswith(p):
+            is_device = True
+            break
+            
+    if is_device:
+        # Capture device identifier from source/host/agent_host tag
+        addr = metric.tags.get("source") or metric.tags.get("agent_host") or metric.tags.get("host")
+        if addr:
+            metric.tags["agent_host"] = addr
+            
+        # Clean up tags that don't have matching columns in SNMP tables
+        if "host" in metric.tags: metric.tags.pop("host")
+        if "host" in metric.fields: metric.fields.pop("host")
+        if "source" in metric.tags: metric.tags.pop("source")
+        if "source" in metric.fields: metric.fields.pop("source")
+        
+        # Specific fix for asf_http_service anomaly field name mismatch
+        if metric.name == "asf_http_service":
+            if "http_service_anomaly_contentlength" in metric.fields:
+                metric.fields.pop("http_service_anomaly_contentlength")
+                
     return metric
 '''
 
 [[processors.starlark]]
+  # Filtering and validation
   order = 4
   source = '''
 def apply(metric):
@@ -104,14 +134,14 @@
         return None
     if metric.name == 'asf_syslog_history' and 'idx' not in metric.fields:
         return None
-    if metric.name not in ['apv_real_stats']:
-        return metric
-    if 'real_server_id' in metric.tags or 'serverid' in metric.fields:
-        return metric
-    return None
+    if metric.name == 'apv_real_stats':
+        if 'real_server_id' not in metric.tags and 'serverid' not in metric.fields:
+            return None
+    return metric
 '''
 
 [[processors.starlark]]
+  # APV Specific: Calculate total hits
   order = 5
   namepass = ["apv_virtual_stats"]
   source = '''
@@ -125,51 +155,41 @@
 '''
 
 [[processors.starlark]]
+  # APV Specific: Clean up LLB strings
+  order = 6
   namepass = ["apv_llb_stats"]
   source = '''
 def apply(metric):
     if metric.name != "apv_llb_stats":
         return metric
 
-    # Fields that must remain as strings
     string_fields = ["link_resp_time", "link_status", "link_down_event", 
                      "link_name", "link_gateway", "host"]
 
-    # Fields that must be integers
     int_fields = ["link_index", "link_hits", "link_conn", "link_usage",
                   "link_down_count", "link_up_time", "link_down_time",
                   "link_bandwid_in", "link_bandwid_out", "link_thresh"]
 
-    # Convert string fields safely
     for f in string_fields:
         if f in metric.fields and metric.fields[f] != None:
             metric.fields[f] = str(metric.fields[f])
 
-    # Convert int fields safely
     for f in int_fields:
         if f in metric.fields:
             val = metric.fields[f]
             if val == None:
-                continue
+                metric.fields[f] = 0
             elif type(val) == "int":
-                metric.fields[f] = val
+                pass
             elif type(val) == "float":
                 metric.fields[f] = int(val)
             elif type(val) == "string":
-                # remove non-digit characters like 'kbps' or 'ms'
                 digits = ""
                 for c in val.elems():
                     if c in "0123456789.":
                         digits += c
-                if digits == "":
-                    metric.fields[f] = 0
-                else:
-                    metric.fields[f] = int(float(digits))
+                metric.fields[f] = int(float(digits)) if digits else 0
             else:
                 metric.fields[f] = 0
-
-    if "host" in metric.tags:
-        metric.tags.pop("host")
-
     return metric
 '''
Index: /branches/amp_4_0/platform/tools/container/stack.yml.template
===================================================================
--- /branches/amp_4_0/platform/tools/container/stack.yml.template	(revision 2954)
+++ /branches/amp_4_0/platform/tools/container/stack.yml.template	(working copy)
@@ -393,10 +393,10 @@
         protocol: tcp
 
   telegraf:
-    image: ${REGISTRY:-127.0.0.1:5000}/amp/telegraf:${TAG_TELEGRAF:-1.29.1}
+    image: ${REGISTRY:-127.0.0.1:5000}/amp/telegraf:${TAG_TELEGRAF:-1.36.4}
     user: root
     security_opt:
-      - label=disable
+      - label:disable
     deploy:
       mode: global # Run on EVERY node to monitor docker
     entrypoint: ["telegraf", "--config", "/etc/telegraf/telegraf.conf", "--config", "/etc/telegraf/telegraf.d/apv.toml", "--config", "/etc/telegraf/telegraf.d/ag.toml", "--config", "/etc/telegraf/telegraf.d/asf.toml"]
@@ -412,6 +412,7 @@
     volumes:
       - /opt/amp/telegraf.conf:/etc/telegraf/telegraf.conf:ro
       - /var/run/docker.sock:/var/run/docker.sock
+      - /var/run/utmp:/var/run/utmp:ro
       - /dev:/dev:ro
       - /:/rootfs:ro
       - /proc:/rootfs/proc:ro
