3 files changed, 329 insertions, 76 deletions
diff --git a/doc/developer/topotests.rst b/doc/developer/topotests.rst
index 7976a206f..e684b9c8a 100644
--- a/doc/developer/topotests.rst
+++ b/doc/developer/topotests.rst
@@ -232,6 +232,85 @@ for ``master`` branch:
 
 and create ``frr`` user and ``frrvty`` group as shown above.
 
+Debugging Topotest Failures
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The debugging options below launch programs (``gdb``, a shell or ``vtysh``).
+If the topotest is run within screen_ or tmux_, these programs are launched
+using that windowing program; otherwise mininet's ``xterm`` functionality is
+used to launch the given program.
+
+If you wish to force the use of ``xterm`` rather than ``tmux`` or ``screen``, or
+wish to use ``gnome-terminal`` instead of ``xterm``, set the environment
+variable ``FRR_TOPO_TERMINAL`` to either ``xterm`` or ``gnome-terminal``.
+
+.. _screen: https://www.gnu.org/software/screen/
+.. _tmux: https://github.com/tmux/tmux/wiki
+
+Spawning ``vtysh`` or Shells on Routers
+"""""""""""""""""""""""""""""""""""""""
+
+Topotest can automatically launch a shell or ``vtysh`` for any or all routers in
+a test. This is enabled by specifying one of two CLI arguments, ``--shell`` or
+``--vtysh``. Both of these options can be set to a single router value, multiple
+comma-separated values, or ``all``.
+
+When either of these options is specified, topotest will pause after each test
+to allow for inspection of the router state.
+
+Here's an example of launching ``vtysh`` on routers ``rt1`` and ``rt2``.
+
+.. code:: shell
+
+   pytest --vtysh=rt1,rt2 all-protocol-startup
+
+Spawning Mininet CLI, ``vtysh`` or Shells on Routers on Test Failure
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Similar to the previous section one can have ``vtysh`` or a shell launched on
+routers, but in this case only when a test fails. To launch the given process on
+each router after a test failure specify one of ``--shell-on-error`` or
+``--vtysh-on-error``.
+
+
+Here's an example of having ``vtysh`` launched on test failure.
+
+.. code:: shell
+
+   pytest --vtysh-on-error all-protocol-startup
+
+
+Additionally, one can have the mininet CLI invoked on test failures by
+specifying the ``--mininet-on-error`` CLI option as shown in the example below.
+
+.. code:: shell
+
+   pytest --mininet-on-error all-protocol-startup
+
+Debugging with GDB
+""""""""""""""""""
+
+Topotest can automatically launch any daemon with ``gdb``, possibly setting
+breakpoints for any test run. This is enabled by specifying one or both of
+``--gdb-routers`` and ``--gdb-daemons``. Additionally ``--gdb-breakpoints`` can
+be used to automatically set breakpoints in the launched ``gdb`` processes.
+
+Each of these options can be set to a single value, multiple comma-separated
+values, or ``all``. If ``--gdb-routers`` is empty but ``--gdb-daemons`` is set
+then the given daemons will be launched in ``gdb`` on all routers in the test.
+Likewise if ``--gdb-routers`` is set, but ``--gdb-daemons`` is empty then all
+daemons on the given routers will be launched in ``gdb``.
+
+Here's an example of launching ``zebra`` and ``bgpd`` inside ``gdb`` on router
+``r1`` with a breakpoint set on ``nb_config_diff``.
+
+.. code:: shell
+
+   pytest --gdb-routers=r1 \
+          --gdb-daemons=bgpd,zebra \
+          --gdb-breakpoints=nb_config_diff \
+          all-protocol-startup
+
 .. _topotests_docker:
 
 Running Tests with Docker
diff --git a/tests/topotests/conftest.py b/tests/topotests/conftest.py
index 04e9961f1..7ad5d8c9a 100755
--- a/tests/topotests/conftest.py
+++ b/tests/topotests/conftest.py
@@ -2,13 +2,14 @@
 Topotest conftest.py file.
 """
 
+import os
+import pdb
+import pytest
+
 from lib.topogen import get_topogen, diagnose_env
 from lib.topotest import json_cmp_result
+from lib.topotest import g_extra_config as topotest_extra_config
 from lib.topolog import logger
-import pytest
-
-topology_only = False
-
 
 def pytest_addoption(parser):
     """
@@ -16,20 +17,72 @@ def pytest_addoption(parser):
     only run the setup_module() to setup the topology without running any tests.
     """
     parser.addoption(
+        "--gdb-breakpoints",
+        metavar="SYMBOL[,SYMBOL...]",
+        help="Comma-separated list of functions to set gdb breakpoints on",
+    )
+
+    parser.addoption(
+        "--gdb-daemons",
+        metavar="DAEMON[,DAEMON...]",
+        help="Comma-separated list of daemons to spawn gdb on, or 'all'",
+    )
+
+    parser.addoption(
+        "--gdb-routers",
+        metavar="ROUTER[,ROUTER...]",
+        help="Comma-separated list of routers to spawn gdb on, or 'all'",
+    )
+
+    parser.addoption(
+        "--mininet-on-error",
+        action="store_true",
+        help="Mininet cli on test failure",
+    )
+
+    parser.addoption(
+        "--pause-after",
+        action="store_true",
+        help="Pause after each test",
+    )
+
+    parser.addoption(
+        "--shell",
+        metavar="ROUTER[,ROUTER...]",
+        help="Comma-separated list of routers to spawn shell on, or 'all'",
+    )
+
+    parser.addoption(
+        "--shell-on-error",
+        action="store_true",
+        help="Spawn shell on all routers on test failure",
+    )
+
+    parser.addoption(
         "--topology-only",
         action="store_true",
         help="Only set up this topology, don't run tests",
     )
 
+    parser.addoption(
+        "--vtysh",
+        metavar="ROUTER[,ROUTER...]",
+        help="Comma-separated list of routers to spawn vtysh on, or 'all'",
+    )
+
+    parser.addoption(
+        "--vtysh-on-error",
+        action="store_true",
+        help="Spawn vtysh on all routers on test failure",
+    )
+
 
 def pytest_runtest_call():
     """
     This function must be run after setup_module(), it does standarized post
     setup routines. It is only being used for the 'topology-only' option.
     """
-    global topology_only
-
-    if topology_only:
+    if topotest_extra_config["topology_only"]:
         tgen = get_topogen()
         if tgen is not None:
             # Allow user to play with the setup.
@@ -42,6 +95,8 @@ def pytest_assertrepr_compare(op, left, right):
     """
     Show proper assertion error message for json_cmp results.
     """
+    del op
+
     json_result = left
     if not isinstance(json_result, json_cmp_result):
         json_result = right
@@ -52,43 +107,105 @@ def pytest_assertrepr_compare(op, left, right):
 
 
 def pytest_configure(config):
-    "Assert that the environment is correctly configured."
-
-    global topology_only
+    """
+    Assert that the environment is correctly configured, and get extra config.
+    """
 
     if not diagnose_env():
-        pytest.exit("enviroment has errors, please read the logs")
+        pytest.exit("environment has errors, please read the logs")
+
+    gdb_routers = config.getoption("--gdb-routers")
+    gdb_routers = gdb_routers.split(",") if gdb_routers else []
+    topotest_extra_config["gdb_routers"] = gdb_routers
+
+    gdb_daemons = config.getoption("--gdb-daemons")
+    gdb_daemons = gdb_daemons.split(",") if gdb_daemons else []
+    topotest_extra_config["gdb_daemons"] = gdb_daemons
+
+    gdb_breakpoints = config.getoption("--gdb-breakpoints")
+    gdb_breakpoints = gdb_breakpoints.split(",") if gdb_breakpoints else []
+    topotest_extra_config["gdb_breakpoints"] = gdb_breakpoints
+
+    mincli_on_error = config.getoption("--mininet-on-error")
+    topotest_extra_config["mininet_on_error"] = mincli_on_error
 
-    if config.getoption("--topology-only"):
-        topology_only = True
+    shell = config.getoption("--shell")
+    topotest_extra_config["shell"] = shell.split(",") if shell else []
+
+    pause_after = config.getoption("--pause-after")
+
+    shell_on_error = config.getoption("--shell-on-error")
+    topotest_extra_config["shell_on_error"] = shell_on_error
+
+    vtysh = config.getoption("--vtysh")
+    topotest_extra_config["vtysh"] = vtysh.split(",") if vtysh else []
+
+    vtysh_on_error = config.getoption("--vtysh-on-error")
+    topotest_extra_config["vtysh_on_error"] = vtysh_on_error
+
+    topotest_extra_config["pause_after"] = (
+        pause_after or shell or vtysh
+    )
+
+    topotest_extra_config["topology_only"] = config.getoption("--topology-only")
 
 
 def pytest_runtest_makereport(item, call):
     "Log all assert messages to default logger with error level"
-    # Nothing happened
-    if call.excinfo is None:
-        return
 
-    parent = item.parent
-    modname = parent.module.__name__
+    # Nothing happened
+    if call.when == "call":
+        pause = topotest_extra_config["pause_after"]
+    else:
+        pause = False
 
-    # Treat skips as non errors
-    if call.excinfo.typename != "AssertionError":
-        logger.info(
-            'assert skipped at "{}/{}": {}'.format(
-                modname, item.name, call.excinfo.value
+    if call.excinfo is None:
+        error = False
+    else:
+        parent = item.parent
+        modname = parent.module.__name__
+
+        # Treat skips as non errors, don't pause after
+        if call.excinfo.typename != "AssertionError":
+            pause = False
+            error = False
+            logger.info(
+                'assert skipped at "{}/{}": {}'.format(
+                    modname, item.name, call.excinfo.value
+                )
+            )
+        else:
+            error = True
+            # Handle assert failures
+            parent._previousfailed = item       # pylint: disable=W0212
+            logger.error(
+                'assert failed at "{}/{}": {}'.format(modname, item.name, call.excinfo.value)
             )
-        )
-        return
-
-    # Handle assert failures
-    parent._previousfailed = item
-    logger.error(
-        'assert failed at "{}/{}": {}'.format(modname, item.name, call.excinfo.value)
-    )
 
-    # (topogen) Set topology error to avoid advancing in the test.
-    tgen = get_topogen()
-    if tgen is not None:
-        # This will cause topogen to report error on `routers_have_failure`.
-        tgen.set_error("{}/{}".format(modname, item.name))
+            # (topogen) Set topology error to avoid advancing in the test.
+            tgen = get_topogen()
+            if tgen is not None:
+                # This will cause topogen to report error on `routers_have_failure`.
+                tgen.set_error("{}/{}".format(modname, item.name))
+
+
+    if error and topotest_extra_config["shell_on_error"]:
+        for router in tgen.routers():
+            pause = True
+            tgen.net[router].runInWindow(os.getenv("SHELL", "bash"))
+
+    if error and topotest_extra_config["vtysh_on_error"]:
+        for router in tgen.routers():
+            pause = True
+            tgen.net[router].runInWindow("vtysh")
+
+    if error and topotest_extra_config["mininet_on_error"]:
+        tgen.mininet_cli()
+
+    if pause:
+        try:
+            user = raw_input('Testing paused, "pdb" to debug, "Enter" to continue: ')
+        except NameError:
+            user = input('Testing paused, "pdb" to debug, "Enter" to continue: ')
+        if user.strip() == "pdb":
+            pdb.set_trace()
diff --git a/tests/topotests/lib/topotest.py b/tests/topotests/lib/topotest.py
index 5cc1a6981..7f768f5b8 100644
--- a/tests/topotests/lib/topotest.py
+++ b/tests/topotests/lib/topotest.py
@@ -50,7 +50,9 @@ from mininet.node import Node, OVSSwitch, Host
 from mininet.log import setLogLevel, info
 from mininet.cli import CLI
 from mininet.link import Intf
+from mininet.term import makeTerm
 
+g_extra_config = {}
 
 def gdb_core(obj, daemon, corefiles):
     gdbcmds = """
@@ -1303,6 +1305,37 @@ class Router(Node):
             logger.info("No daemon {} known".format(daemon))
         # print "Daemons after:", self.daemons
 
+    # Run a command in a new window (gnome-terminal, screen, tmux, xterm)
+    def runInWindow(self, cmd, title=None):
+        topo_terminal = os.getenv("FRR_TOPO_TERMINAL")
+        if topo_terminal or (
+                "TMUX" not in os.environ and "STY" not in os.environ
+        ):
+            term = topo_terminal if topo_terminal else "xterm"
+            makeTerm(
+                self,
+                title=title if title else cmd,
+                term=term,
+                cmd=cmd)
+        else:
+            nscmd = "sudo nsenter -m -n -t {} {}".format(self.pid, cmd)
+            if "TMUX" in os.environ:
+                self.cmd("tmux select-layout main-horizontal")
+                wcmd = "tmux split-window -h"
+                cmd = "{} {}".format(wcmd, nscmd)
+            elif "STY" in os.environ:
+                if os.path.exists(
+                        "/run/screen/S-{}/{}".format(
+                            os.environ['USER'], os.environ['STY']
+                        )
+                ):
+                    wcmd = "screen"
+                else:
+                    wcmd = "sudo -u {} screen".format(os.environ["SUDO_USER"])
+                cmd = "{} {}".format(wcmd, nscmd)
+            self.cmd(cmd)
+
+
     def startRouter(self, tgen=None):
         # Disable integrated-vtysh-config
         self.cmd(
@@ -1355,6 +1388,14 @@ class Router(Node):
                 return "LDP/MPLS Tests need mpls kernel modules"
         self.cmd("echo 100000 > /proc/sys/net/mpls/platform_labels")
 
+        shell_routers = g_extra_config["shell"]
+        if "all" in shell_routers or self.name in shell_routers:
+            self.runInWindow(os.getenv("SHELL", "bash"))
+
+        vtysh_routers = g_extra_config["vtysh"]
+        if "all" in vtysh_routers or self.name in vtysh_routers:
+            self.runInWindow("vtysh")
+
         if self.daemons["eigrpd"] == 1:
             eigrpd_path = os.path.join(self.daemondir, "eigrpd")
             if not os.path.isfile(eigrpd_path):
@@ -1381,6 +1422,10 @@ class Router(Node):
     def startRouterDaemons(self, daemons=None):
         "Starts all FRR daemons for this router."
 
+        gdb_breakpoints =  g_extra_config["gdb_breakpoints"]
+        gdb_daemons = g_extra_config["gdb_daemons"]
+        gdb_routers = g_extra_config["gdb_routers"]
+
         bundle_data = ""
 
         if os.path.exists("/etc/frr/support_bundle_commands.conf"):
@@ -1410,7 +1455,7 @@ class Router(Node):
         # If `daemons` was specified then some upper API called us with
         # specific daemons, otherwise just use our own configuration.
         daemons_list = []
-        if daemons != None:
+        if daemons is not None:
             daemons_list = daemons
         else:
             # Append all daemons configured.
@@ -1418,47 +1463,67 @@ class Router(Node):
                 if self.daemons[daemon] == 1:
                     daemons_list.append(daemon)
 
-        # Start Zebra first
-        if "zebra" in daemons_list:
-            zebra_path = os.path.join(self.daemondir, "zebra")
-            zebra_option = self.daemons_options["zebra"]
-            self.cmd(
-                "ASAN_OPTIONS=log_path=zebra.asan {0} {1} --log file:zebra.log --log-level debug -s 90000000 -d > zebra.out 2> zebra.err".format(
-                    zebra_path, zebra_option
+        def start_daemon(daemon, extra_opts=None):
+            daemon_opts = self.daemons_options.get(daemon, "")
+            rediropt = " > {0}.out 2> {0}.err".format(daemon)
+            if daemon == "snmpd":
+                binary = "/usr/sbin/snmpd"
+                cmdenv = ""
+                cmdopt = "{} -C -c /etc/frr/snmpd.conf -p ".format(
+                    daemon_opts
+                ) + "/var/run/{}/snmpd.pid -x /etc/frr/agentx".format(self.routertype)
+            else:
+                binary = os.path.join(self.daemondir, daemon)
+                cmdenv = "ASAN_OPTIONS=log_path={0}.asan".format(daemon)
+                cmdopt = "{} --log file:{}.log --log-level debug".format(
+                    daemon_opts, daemon
                 )
-            )
-            logger.debug("{}: {} zebra started".format(self, self.routertype))
+            if extra_opts:
+                cmdopt += " " + extra_opts
+
+            if (
+                (gdb_routers or gdb_daemons)
+                and (not gdb_routers
+                     or self.name in gdb_routers
+                     or "all" in gdb_routers)
+                and (not gdb_daemons
+                     or daemon in gdb_daemons
+                     or "all" in gdb_daemons)
+            ):
+                if daemon == "snmpd":
+                    cmdopt += " -f "
+
+                cmdopt += rediropt
+                gdbcmd = "sudo -E gdb " + binary
+                if gdb_breakpoints:
+                    gdbcmd += " -ex 'set breakpoint pending on'"
+                for bp in gdb_breakpoints:
+                    gdbcmd += " -ex 'b {}'".format(bp)
+                gdbcmd += " -ex 'run {}'".format(cmdopt)
+
+                self.runInWindow(gdbcmd, daemon)
+            else:
+                if daemon != "snmpd":
+                    cmdopt += " -d "
+                cmdopt += rediropt
+                self.cmd(" ".join([cmdenv, binary, cmdopt]))
+            logger.info("{}: {} {} started".format(self, self.routertype, daemon))
 
-            # Remove `zebra` so we don't attempt to start it again.
+
+        # Start Zebra first
+        if "zebra" in daemons_list:
+            start_daemon("zebra", "-s 90000000")
             while "zebra" in daemons_list:
                 daemons_list.remove("zebra")
 
         # Start staticd next if required
         if "staticd" in daemons_list:
-            staticd_path = os.path.join(self.daemondir, "staticd")
-            staticd_option = self.daemons_options["staticd"]
-            self.cmd(
-                "ASAN_OPTIONS=log_path=staticd.asan {0} {1} --log file:staticd.log --log-level debug -d > staticd.out 2> staticd.err".format(
-                    staticd_path, staticd_option
-                )
-            )
-            logger.debug("{}: {} staticd started".format(self, self.routertype))
-
-            # Remove `staticd` so we don't attempt to start it again.
+            start_daemon("staticd")
             while "staticd" in daemons_list:
                 daemons_list.remove("staticd")
 
         if "snmpd" in daemons_list:
-            snmpd_path = "/usr/sbin/snmpd"
-            snmpd_option = self.daemons_options["snmpd"]
-            self.cmd(
-                "{0} {1} -C -c /etc/frr/snmpd.conf -p /var/run/{2}/snmpd.pid -x /etc/frr/agentx > snmpd.out 2> snmpd.err".format(
-                    snmpd_path, snmpd_option, self.routertype
-                )
-            )
-            logger.info("{}: {} snmpd started".format(self, self.routertype))
-
-            # Remove `snmpd` so we don't attempt to start it again.
+            start_daemon("snmpd")
             while "snmpd" in daemons_list:
                 daemons_list.remove("snmpd")
 
@@ -1470,17 +1535,9 @@ class Router(Node):
 
         # Now start all the other daemons
         for daemon in daemons_list:
-            # Skip disabled daemons and zebra
             if self.daemons[daemon] == 0:
                 continue
-
-            daemon_path = os.path.join(self.daemondir, daemon)
-            self.cmd(
-                "ASAN_OPTIONS=log_path={2}.asan {0} {1} --log file:{2}.log --log-level debug -d > {2}.out 2> {2}.err".format(
-                    daemon_path, self.daemons_options.get(daemon, ""), daemon
-                )
-            )
-            logger.debug("{}: {} {} started".format(self, self.routertype, daemon))
+            start_daemon(daemon)
 
         # Check if daemons are running.
         rundaemons = self.cmd("ls -1 /var/run/%s/*.pid" % self.routertype)