Queues: - GTK Debug: https://build.webkit.org/#/builders/63 - WPE Debug: https://build.webkit.org/#/builders/14 After the migration to the external hosts, these bots are timing out on the global 1200sec timer.
Example worker trace from WPE-Debug build https://build.webkit.org/#/builders/14/builds/4074 https://build.webkit.org/results/WPE-Linux-64-bit-Debug-Tests/256601@main%20(4074)/74-python_stack_trace.txt SIGTERM signal received Traceback (most recent call last): File "/app/webkit/Tools/Scripts/run-webkit-tests", line 46, in <module> sys.exit(main(sys.argv[1:], sys.stdout, sys.stderr)) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/run_webkit_tests.py", line 92, in main run_details = run(port, options, args, stderr) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/run_webkit_tests.py", line 501, in run run_details = manager.run(args) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 413, in run temp_initial_results, temp_retry_results, temp_enabled_pixel_tests_in_retry = self._run_test_subset(test_inputs, device_type=device_type) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 487, in _run_test_subset initial_results = self._run_tests(test_inputs, self._options.repeat_each, self._options.iterations, int(self._options.child_processes), retrying=False, device_type=device_type) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 570, in _run_tests return self._runner.run_tests(self._expectations[device_type], new_test_inputs, num_workers, retrying, device_type) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 166, in run_tests with TaskPool( File "/app/webkit/Tools/Scripts/libraries/webkitcorepy/webkitcorepy/task_pool.py", line 396, in __enter__ worker.start() File "/usr/lib/python3.10/multiprocessing/process.py", line 121, in start self._popen = self._Popen(self) File "/usr/lib/python3.10/multiprocessing/context.py", line 224, in _Popen return _default_context.get_context().Process._Popen(process_obj) File "/usr/lib/python3.10/multiprocessing/context.py", line 281, in _Popen return Popen(process_obj) File 
"/usr/lib/python3.10/multiprocessing/popen_fork.py", line 19, in __init__ self._launch(process_obj) File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 71, in _launch code = process_obj._bootstrap(parent_sentinel=child_r) File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/app/webkit/Tools/Scripts/libraries/webkitcorepy/webkitcorepy/task_pool.py", line 303, in main queue.send(_Result(value=task(None), id=task.id)) File "/app/webkit/Tools/Scripts/libraries/webkitcorepy/webkitcorepy/task_pool.py", line 56, in __call__ return self.function(*self.args, **self.kwargs) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 77, in run_shard return Worker.instance.run_tests(shard) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 319, in run_tests Worker.instance.run_test(input, shard.name) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 347, in run_test result = self._run_test_with_or_without_timeout(test_input, test_timeout_sec, stop_when_done) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 404, in _run_test_with_or_without_timeout return self._run_test_in_this_thread(test_input, stop_when_done) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 490, in _run_test_in_this_thread return self._run_single_test(self._driver, test_input, stop_when_done) File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 493, in _run_single_test return single_test_runner.run_single_test( File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/single_test_runner.py", line 48, in run_single_test return runner.run() File 
"/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/single_test_runner.py", line 128, in run return self._run_compare_test() File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/single_test_runner.py", line 131, in _run_compare_test driver_output = self._driver.run_test(self._driver_input(), self._stop_when_done) File "/app/webkit/Tools/Scripts/webkitpy/port/driver.py", line 863, in run_test return self._driver.run_test(driver_input, stop_when_done) File "/app/webkit/Tools/Scripts/webkitpy/port/driver.py", line 234, in run_test text, audio = self._read_first_block(deadline, driver_input.test_name) # First block is either text or audio File "/app/webkit/Tools/Scripts/webkitpy/port/driver.py", line 673, in _read_first_block block = self._read_block(deadline, test_name) File "/app/webkit/Tools/Scripts/webkitpy/port/driver.py", line 735, in _read_block out_line, err_line = self._server_process.read_either_stdout_or_stderr_line(deadline) File "/app/webkit/Tools/Scripts/webkitpy/port/server_process.py", line 237, in read_either_stdout_or_stderr_line return_value = self._read(deadline, retrieve_bytes_from_buffers) File "/app/webkit/Tools/Scripts/webkitpy/port/server_process.py", line 385, in _read self._wait_for_data_and_update_buffers_using_select(deadline) File "/app/webkit/Tools/Scripts/webkitpy/port/server_process.py", line 286, in _wait_for_data_and_update_buffers_using_select read_fds, _, _ = select.select(select_fds, [], select_fds, max(deadline - time.time(), 0))
Not sure if related, but I'm seeing a considerable number of warnings like this: 00:53:10.446 2 worker/0 ERROR: Failed to get RTTimeUSecMax from RealtimeKit: GDBus.Error:org.freedesktop.DBus.Error.InvalidArgs: No such interface “org.freedesktop.portal.Realtime” 00:53:10.446 2 worker/0 /app/webkit/Source/WTF/wtf/linux/RealTimeThreads.cpp(204) : void WTF::RealTimeThreads::realTimeKitMakeThreadRealTime(uint64_t, uint64_t, uint32_t)
(In reply to Lauro Moura from comment #2) > Not sure if related, but I'm seeing a considerable number of warnings like > this: > > 00:53:10.446 2 worker/0 ERROR: Failed to get RTTimeUSecMax from > RealtimeKit: GDBus.Error:org.freedesktop.DBus.Error.InvalidArgs: No such > interface “org.freedesktop.portal.Realtime” > 00:53:10.446 2 worker/0 > /app/webkit/Source/WTF/wtf/linux/RealTimeThreads.cpp(204) : void > WTF::RealTimeThreads::realTimeKitMakeThreadRealTime(uint64_t, uint64_t, > uint32_t) These should just be warnings. But the runner could also install `xdg-desktop-portal > 1.14.6`
(In reply to Patrick Griffis from comment #3) > (In reply to Lauro Moura from comment #2) > > Not sure if related, but I'm seeing a considerable number of warnings like > > this: > > > > 00:53:10.446 2 worker/0 ERROR: Failed to get RTTimeUSecMax from > > RealtimeKit: GDBus.Error:org.freedesktop.DBus.Error.InvalidArgs: No such > > interface “org.freedesktop.portal.Realtime” > > 00:53:10.446 2 worker/0 > > /app/webkit/Source/WTF/wtf/linux/RealTimeThreads.cpp(204) : void > > WTF::RealTimeThreads::realTimeKitMakeThreadRealTime(uint64_t, uint64_t, > > uint32_t) > > These should just be warnings. But the runner could also install > `xdg-desktop-portal > 1.14.6` I enabled bullseye-backports on the bot and installed the following versions: * xdg-desktop-portal - 1.15.0-2~bpo11+1 * xdg-desktop-portal-gtk - 1.14.0 Unfortunately, the errors keep appearing (Current job: https://build.webkit.org/#/builders/14/builds/4202 ) Btw, there is this screenshot portal issue with a similar error, fixed by downgrading from 1.15.0 to 1.14.6, that might bring some clue: > The only way this error can happen is if the implementation D-Bus proxy fails to be created. I can confirm it works on xdg-desktop-portal-gnome, and xdg-desktop-portal-gtk, but I don't know if -kde and -wlr do something different that could be incompatible with the latest changes. https://github.com/flatpak/xdg-desktop-portal/issues/861
After some manual testing on the debug external bot: - This issue is not debug-specific; it also happens with release builds on the external host and started happening on the WPE release tester too, which is a standalone machine. - Bisecting consistently shows this issue starting with the move to the FDO-SDK 22.08 (255085@main) - The SDK upgrade bumped Python from 3.9.9 to 3.10.6, which might be related. Could not reproduce yet on my desktop or with a subset of the tests other than the full suite.
(In reply to Lauro Moura from comment #5) > After some manual testing on the debug external bot: > > - This issue is not debug-specific, also happens to release builds on the > external host and started happening on the WPE release tester too, which is > a standalone machine. > - Bisecting consistently shows this issue starting in move to the FDO-SDK > 22.08 (255085@main) > - The SDK upgrade bumped Python from 3.9.9 to 3.10.6, which might be related. > > Could not reproduce yet on my desktop or with a subset of the tests other > than the full suite. I landed this the other day... [GTK][WPE][run-webkit-tests] layout tests are deadlocking when a test crashes due to unhandled BrokenPipeError https://bugs.webkit.org/show_bug.cgi?id=248533 What I found is that running the layout tests with one worker (export NUMBER_OF_PROCESSORS=1) helps to debug these issues, because with one worker you will see the Python exception that needs to be fixed; otherwise, with several workers, the exception is hidden and you get a deadlock.
(In reply to Carlos Alberto Lopez Perez from comment #6) > > > What I found is that when running the layout tests with one worker (export > NUMBER_OF_PROCESSORS=1) it helps to debug this issues, because with one > worker you will see the python exception that need to be fixed, otherwise > with several workers the exception is hidden and you get a deadlock. And of course it is also really helpful to pass --debug-rwt-logging to run-webkit-tests to have debug output enabled
(In reply to Carlos Alberto Lopez Perez from comment #6) > (In reply to Lauro Moura from comment #5) > > After some manual testing on the debug external bot: > > > > - This issue is not debug-specific, also happens to release builds on the > > external host and started happening on the WPE release tester too, which is > > a standalone machine. > > - Bisecting consistently shows this issue starting in move to the FDO-SDK > > 22.08 (255085@main) > > - The SDK upgrade bumped Python from 3.9.9 to 3.10.6, which might be related. > > > > Could not reproduce yet on my desktop or with a subset of the tests other > > than the full suite. > > I landed this the other day... > > [GTK][WPE][run-webkit-tests] layout tests are deadlocking when a test > crashes due to unhandled BrokenPipeError > https://bugs.webkit.org/show_bug.cgi?id=248533 > > > What I found is that when running the layout tests with one worker (export > NUMBER_OF_PROCESSORS=1) it helps to debug this issues, because with one > worker you will see the python exception that need to be fixed, otherwise > with several workers the exception is hidden and you get a deadlock. This commit indeed fixed the 1 worker case, but I'm still getting the #c1 backtrace whenever I run with n>1 workers under heavy load (managed to reproduce locally with 12 child processes in my 8-core laptop). Changing the version inside flatpak[1], I could reliably reproduce the issue beginning with python 3.9.13 and 3.10.5. [1] https://gist.github.com/lauromoura/5159c8428048d7d592b2562a142f3cf2
And the specific CPython commit introducing this issue is this one: https://github.com/python/cpython/commit/524d2750e33b4d9c98a562943863abe7fd1236cd > bpo-47029: Fix BrokenPipeError in multiprocessing.Queue at garbage collection and explicit close (GH-31913) I haven't figured out yet where exactly in our tooling this is causing issues, so I'm trying to downgrade Python to 3.10.5 in our SDK as a workaround for a while.
Another finding: The deadlock happens only when the suite exits early. This explains why debug runs failed before the release ones. And we can trigger the hang using a low number for `exit-after-n-failures`, for example. And here's a trace from the main script after attaching to it when it hangs: Thread 0x00007f0280d1b640 (most recent call first): File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/connection.py", line 373 in _send File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/connection.py", line 416 in _send_bytes File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/connection.py", line 205 in send_bytes File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/queues.py", line 250 in _feed File "/opt/cpython-flatpak/lib/python3.9/threading.py", line 910 in run File "/opt/cpython-flatpak/lib/python3.9/threading.py", line 973 in _bootstrap_inner File "/opt/cpython-flatpak/lib/python3.9/threading.py", line 930 in _bootstrap Current thread 0x00007f028bb4cb80 (most recent call first): File "<string>", line 1 in <module> File "/opt/cpython-flatpak/lib/python3.9/threading.py", line 1073 in _wait_for_tstate_lock File "/opt/cpython-flatpak/lib/python3.9/threading.py", line 1053 in join File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/queues.py", line 199 in _finalize_join File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/util.py", line 224 in __call__ File "/opt/cpython-flatpak/lib/python3.9/multiprocessing/queues.py", line 151 in join_thread File "/app/webkit/Tools/Scripts/libraries/webkitcorepy/webkitcorepy/task_pool.py", line 165 in close File "/app/webkit/Tools/Scripts/libraries/webkitcorepy/webkitcorepy/task_pool.py", line 479 in __exit__ File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/layout_test_runner.py", line 180 in run_tests File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 570 in _run_tests File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 
487 in _run_test_subset File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/controllers/manager.py", line 413 in run File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/run_webkit_tests.py", line 508 in run File "/app/webkit/Tools/Scripts/webkitpy/layout_tests/run_webkit_tests.py", line 92 in main File "/app/webkit/Tools/Scripts/run-webkit-tests", line 46 in <module>
Pull request: https://github.com/WebKit/WebKit/pull/8564
Marking as fixed as the PR https://github.com/WebKit/WebKit/pull/8564 fixing it was merged earlier this year.