2 * Copyright © 2016 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_requests.h"
37 #include "gt/selftest_engine_heartbeat.h"
39 #include "i915_random.h"
40 #include "i915_selftest.h"
41 #include "igt_flush_test.h"
42 #include "igt_live_test.h"
43 #include "igt_spinner.h"
44 #include "lib_sw_fence.h"
47 #include "mock_gem_device.h"
49 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 struct intel_engine_cs *engine;
55 for_each_uabi_engine(engine, i915)
61 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66 static int igt_add_request(void *arg)
68 struct drm_i915_private *i915 = arg;
69 struct i915_request *request;
71 /* Basic preliminary test to create a request and let it loose! */
73 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
77 i915_request_add(request);
82 static int igt_wait_request(void *arg)
84 const long T = HZ / 4;
85 struct drm_i915_private *i915 = arg;
86 struct i915_request *request;
89 /* Submit a request, then wait upon it */
91 request = mock_request(rcs0(i915)->kernel_context, T);
95 i915_request_get(request);
97 if (i915_request_wait(request, 0, 0) != -ETIME) {
98 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
102 if (i915_request_wait(request, 0, T) != -ETIME) {
103 pr_err("request wait succeeded (expected timeout before submit!)\n");
107 if (i915_request_completed(request)) {
108 pr_err("request completed before submit!!\n");
112 i915_request_add(request);
114 if (i915_request_wait(request, 0, 0) != -ETIME) {
115 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
119 if (i915_request_completed(request)) {
120 pr_err("request completed immediately!\n");
124 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
125 pr_err("request wait succeeded (expected timeout!)\n");
129 if (i915_request_wait(request, 0, T) == -ETIME) {
130 pr_err("request wait timed out!\n");
134 if (!i915_request_completed(request)) {
135 pr_err("request not complete after waiting!\n");
139 if (i915_request_wait(request, 0, T) == -ETIME) {
140 pr_err("request wait timed out when already complete!\n");
146 i915_request_put(request);
147 mock_device_flush(i915);
151 static int igt_fence_wait(void *arg)
153 const long T = HZ / 4;
154 struct drm_i915_private *i915 = arg;
155 struct i915_request *request;
158 /* Submit a request, treat it as a fence and wait upon it */
160 request = mock_request(rcs0(i915)->kernel_context, T);
164 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
165 pr_err("fence wait success before submit (expected timeout)!\n");
169 i915_request_add(request);
171 if (dma_fence_is_signaled(&request->fence)) {
172 pr_err("fence signaled immediately!\n");
176 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
177 pr_err("fence wait success after submit (expected timeout)!\n");
181 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
182 pr_err("fence wait timed out (expected success)!\n");
186 if (!dma_fence_is_signaled(&request->fence)) {
187 pr_err("fence unsignaled after waiting!\n");
191 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
192 pr_err("fence wait timed out when complete (expected success)!\n");
198 mock_device_flush(i915);
202 static int igt_request_rewind(void *arg)
204 struct drm_i915_private *i915 = arg;
205 struct i915_request *request, *vip;
206 struct i915_gem_context *ctx[2];
207 struct intel_context *ce;
210 ctx[0] = mock_context(i915, "A");
212 ce = i915_gem_context_get_engine(ctx[0], RCS0);
213 GEM_BUG_ON(IS_ERR(ce));
214 request = mock_request(ce, 2 * HZ);
215 intel_context_put(ce);
221 i915_request_get(request);
222 i915_request_add(request);
224 ctx[1] = mock_context(i915, "B");
226 ce = i915_gem_context_get_engine(ctx[1], RCS0);
227 GEM_BUG_ON(IS_ERR(ce));
228 vip = mock_request(ce, 0);
229 intel_context_put(ce);
235 /* Simulate preemption by manual reordering */
236 if (!mock_cancel_request(request)) {
237 pr_err("failed to cancel request (already executed)!\n");
238 i915_request_add(vip);
241 i915_request_get(vip);
242 i915_request_add(vip);
244 request->engine->submit_request(request);
248 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
249 pr_err("timed out waiting for high priority request\n");
253 if (i915_request_completed(request)) {
254 pr_err("low priority request already completed\n");
260 i915_request_put(vip);
262 mock_context_close(ctx[1]);
263 i915_request_put(request);
265 mock_context_close(ctx[0]);
266 mock_device_flush(i915);
271 struct intel_engine_cs *engine;
272 struct i915_gem_context **contexts;
273 atomic_long_t num_waits, num_fences;
274 int ncontexts, max_batch;
275 struct i915_request *(*request_alloc)(struct intel_context *ce);
278 static struct i915_request *
279 __mock_request_alloc(struct intel_context *ce)
281 return mock_request(ce, 0);
284 static struct i915_request *
285 __live_request_alloc(struct intel_context *ce)
287 return intel_context_create_request(ce);
290 static int __igt_breadcrumbs_smoketest(void *arg)
292 struct smoketest *t = arg;
293 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294 const unsigned int total = 4 * t->ncontexts + 1;
295 unsigned int num_waits = 0, num_fences = 0;
296 struct i915_request **requests;
297 I915_RND_STATE(prng);
302 * A very simple test to catch the most egregious of list handling bugs.
304 * At its heart, we simply create oodles of requests running across
305 * multiple kthreads and enable signaling on them, for the sole purpose
306 * of stressing our breadcrumb handling. The only inspection we do is
307 * that the fences were marked as signaled.
310 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
314 order = i915_random_order(total, &prng);
320 while (!kthread_should_stop()) {
321 struct i915_sw_fence *submit, *wait;
322 unsigned int n, count;
324 submit = heap_fence_create(GFP_KERNEL);
330 wait = heap_fence_create(GFP_KERNEL);
332 i915_sw_fence_commit(submit);
333 heap_fence_put(submit);
338 i915_random_reorder(order, total, &prng);
339 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
341 for (n = 0; n < count; n++) {
342 struct i915_gem_context *ctx =
343 t->contexts[order[n] % t->ncontexts];
344 struct i915_request *rq;
345 struct intel_context *ce;
347 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348 GEM_BUG_ON(IS_ERR(ce));
349 rq = t->request_alloc(ce);
350 intel_context_put(ce);
357 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
361 requests[n] = i915_request_get(rq);
362 i915_request_add(rq);
365 err = i915_sw_fence_await_dma_fence(wait,
371 i915_request_put(rq);
377 i915_sw_fence_commit(submit);
378 i915_sw_fence_commit(wait);
380 if (!wait_event_timeout(wait->wait,
381 i915_sw_fence_done(wait),
383 struct i915_request *rq = requests[count - 1];
385 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386 atomic_read(&wait->pending), count,
387 rq->fence.context, rq->fence.seqno,
391 intel_gt_set_wedged(t->engine->gt);
392 GEM_BUG_ON(!i915_request_completed(rq));
393 i915_sw_fence_wait(wait);
397 for (n = 0; n < count; n++) {
398 struct i915_request *rq = requests[n];
400 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
402 pr_err("%llu:%llu was not signaled!\n",
403 rq->fence.context, rq->fence.seqno);
407 i915_request_put(rq);
410 heap_fence_put(wait);
411 heap_fence_put(submit);
422 atomic_long_add(num_fences, &t->num_fences);
423 atomic_long_add(num_waits, &t->num_waits);
431 static int mock_breadcrumbs_smoketest(void *arg)
433 struct drm_i915_private *i915 = arg;
434 struct smoketest t = {
435 .engine = rcs0(i915),
438 .request_alloc = __mock_request_alloc
440 unsigned int ncpus = num_online_cpus();
441 struct task_struct **threads;
446 * Smoketest our breadcrumb/signal handling for requests across multiple
447 * threads. A very simple test to only catch the most egregious of bugs.
448 * See __igt_breadcrumbs_smoketest();
451 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
455 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
461 for (n = 0; n < t.ncontexts; n++) {
462 t.contexts[n] = mock_context(t.engine->i915, "mock");
463 if (!t.contexts[n]) {
469 for (n = 0; n < ncpus; n++) {
470 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
472 if (IS_ERR(threads[n])) {
473 ret = PTR_ERR(threads[n]);
478 get_task_struct(threads[n]);
481 yield(); /* start all threads before we begin */
482 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484 for (n = 0; n < ncpus; n++) {
487 err = kthread_stop(threads[n]);
491 put_task_struct(threads[n]);
493 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
494 atomic_long_read(&t.num_waits),
495 atomic_long_read(&t.num_fences),
499 for (n = 0; n < t.ncontexts; n++) {
502 mock_context_close(t.contexts[n]);
510 int i915_request_mock_selftests(void)
512 static const struct i915_subtest tests[] = {
513 SUBTEST(igt_add_request),
514 SUBTEST(igt_wait_request),
515 SUBTEST(igt_fence_wait),
516 SUBTEST(igt_request_rewind),
517 SUBTEST(mock_breadcrumbs_smoketest),
519 struct drm_i915_private *i915;
520 intel_wakeref_t wakeref;
523 i915 = mock_gem_device();
527 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528 err = i915_subtests(tests, i915);
530 mock_destroy_device(i915);
535 static int live_nop_request(void *arg)
537 struct drm_i915_private *i915 = arg;
538 struct intel_engine_cs *engine;
539 struct igt_live_test t;
543 * Submit various sized batches of empty requests, to each engine
544 * (individually), and wait for the batch to complete. We can check
545 * the overhead of submitting requests to the hardware.
548 for_each_uabi_engine(engine, i915) {
549 unsigned long n, prime;
550 IGT_TIMEOUT(end_time);
551 ktime_t times[2] = {};
553 err = igt_live_test_begin(&t, i915, __func__, engine->name);
557 intel_engine_pm_get(engine);
558 for_each_prime_number_from(prime, 1, 8192) {
559 struct i915_request *request = NULL;
561 times[1] = ktime_get_raw();
563 for (n = 0; n < prime; n++) {
564 i915_request_put(request);
565 request = i915_request_create(engine->kernel_context);
567 return PTR_ERR(request);
570 * This space is left intentionally blank.
572 * We do not actually want to perform any
573 * action with this request, we just want
574 * to measure the latency in allocation
575 * and submission of our breadcrumbs -
576 * ensuring that the bare request is sufficient
577 * for the system to work (i.e. proper HEAD
578 * tracking of the rings, interrupt handling,
579 * etc). It also gives us the lowest bounds
583 i915_request_get(request);
584 i915_request_add(request);
586 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587 i915_request_put(request);
589 times[1] = ktime_sub(ktime_get_raw(), times[1]);
593 if (__igt_timeout(end_time, NULL))
596 intel_engine_pm_put(engine);
598 err = igt_live_test_end(&t);
602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
604 ktime_to_ns(times[0]),
605 prime, div64_u64(ktime_to_ns(times[1]), prime));
611 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
613 struct drm_i915_gem_object *obj;
614 struct i915_vma *vma;
618 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
620 return ERR_CAST(obj);
622 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
628 *cmd = MI_BATCH_BUFFER_END;
630 __i915_gem_object_flush_map(obj, 0, 64);
631 i915_gem_object_unpin_map(obj);
633 intel_gt_chipset_flush(&i915->gt);
635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
641 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
645 /* Force the wait now to avoid including it in the benchmark */
646 err = i915_vma_sync(vma);
655 i915_gem_object_put(obj);
659 static struct i915_request *
660 empty_request(struct intel_engine_cs *engine,
661 struct i915_vma *batch)
663 struct i915_request *request;
666 request = i915_request_create(engine->kernel_context);
670 err = engine->emit_bb_start(request,
673 I915_DISPATCH_SECURE);
677 i915_request_get(request);
679 i915_request_add(request);
680 return err ? ERR_PTR(err) : request;
683 static int live_empty_request(void *arg)
685 struct drm_i915_private *i915 = arg;
686 struct intel_engine_cs *engine;
687 struct igt_live_test t;
688 struct i915_vma *batch;
692 * Submit various sized batches of empty requests, to each engine
693 * (individually), and wait for the batch to complete. We can check
694 * the overhead of submitting requests to the hardware.
697 batch = empty_batch(i915);
699 return PTR_ERR(batch);
701 for_each_uabi_engine(engine, i915) {
702 IGT_TIMEOUT(end_time);
703 struct i915_request *request;
704 unsigned long n, prime;
705 ktime_t times[2] = {};
707 err = igt_live_test_begin(&t, i915, __func__, engine->name);
711 intel_engine_pm_get(engine);
713 /* Warmup / preload */
714 request = empty_request(engine, batch);
715 if (IS_ERR(request)) {
716 err = PTR_ERR(request);
717 intel_engine_pm_put(engine);
720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
722 for_each_prime_number_from(prime, 1, 8192) {
723 times[1] = ktime_get_raw();
725 for (n = 0; n < prime; n++) {
726 i915_request_put(request);
727 request = empty_request(engine, batch);
728 if (IS_ERR(request)) {
729 err = PTR_ERR(request);
730 intel_engine_pm_put(engine);
734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
736 times[1] = ktime_sub(ktime_get_raw(), times[1]);
740 if (__igt_timeout(end_time, NULL))
743 i915_request_put(request);
744 intel_engine_pm_put(engine);
746 err = igt_live_test_end(&t);
750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
752 ktime_to_ns(times[0]),
753 prime, div64_u64(ktime_to_ns(times[1]), prime));
757 i915_vma_unpin(batch);
762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
764 struct drm_i915_gem_object *obj;
765 const int gen = INTEL_GEN(i915);
766 struct i915_vma *vma;
770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
772 return ERR_CAST(obj);
774 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
780 err = i915_vma_pin(vma, 0, 0, PIN_USER);
784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
792 *cmd++ = lower_32_bits(vma->node.start);
793 *cmd++ = upper_32_bits(vma->node.start);
794 } else if (gen >= 6) {
795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
796 *cmd++ = lower_32_bits(vma->node.start);
798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
799 *cmd++ = lower_32_bits(vma->node.start);
801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
803 __i915_gem_object_flush_map(obj, 0, 64);
804 i915_gem_object_unpin_map(obj);
806 intel_gt_chipset_flush(&i915->gt);
811 i915_gem_object_put(obj);
815 static int recursive_batch_resolve(struct i915_vma *batch)
819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
823 *cmd = MI_BATCH_BUFFER_END;
825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
826 i915_gem_object_unpin_map(batch->obj);
828 intel_gt_chipset_flush(batch->vm->gt);
833 static int live_all_engines(void *arg)
835 struct drm_i915_private *i915 = arg;
836 const unsigned int nengines = num_uabi_engines(i915);
837 struct intel_engine_cs *engine;
838 struct i915_request **request;
839 struct igt_live_test t;
840 struct i915_vma *batch;
845 * Check we can submit requests to all engines simultaneously. We
846 * send a recursive batch to each engine - checking that we don't
847 * block doing so, and that they don't complete too soon.
850 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
854 err = igt_live_test_begin(&t, i915, __func__, "");
858 batch = recursive_batch(i915);
860 err = PTR_ERR(batch);
861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
865 i915_vma_lock(batch);
868 for_each_uabi_engine(engine, i915) {
869 request[idx] = intel_engine_create_kernel_request(engine);
870 if (IS_ERR(request[idx])) {
871 err = PTR_ERR(request[idx]);
872 pr_err("%s: Request allocation failed with err=%d\n",
877 err = i915_request_await_object(request[idx], batch->obj, 0);
879 err = i915_vma_move_to_active(batch, request[idx], 0);
882 err = engine->emit_bb_start(request[idx],
887 request[idx]->batch = batch;
889 i915_request_get(request[idx]);
890 i915_request_add(request[idx]);
894 i915_vma_unlock(batch);
897 for_each_uabi_engine(engine, i915) {
898 if (i915_request_completed(request[idx])) {
899 pr_err("%s(%s): request completed too early!\n",
900 __func__, engine->name);
907 err = recursive_batch_resolve(batch);
909 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
914 for_each_uabi_engine(engine, i915) {
917 timeout = i915_request_wait(request[idx], 0,
918 MAX_SCHEDULE_TIMEOUT);
921 pr_err("%s: error waiting for request on %s, err=%d\n",
922 __func__, engine->name, err);
926 GEM_BUG_ON(!i915_request_completed(request[idx]));
927 i915_request_put(request[idx]);
932 err = igt_live_test_end(&t);
936 for_each_uabi_engine(engine, i915) {
938 i915_request_put(request[idx]);
941 i915_vma_unpin(batch);
948 static int live_sequential_engines(void *arg)
950 struct drm_i915_private *i915 = arg;
951 const unsigned int nengines = num_uabi_engines(i915);
952 struct i915_request **request;
953 struct i915_request *prev = NULL;
954 struct intel_engine_cs *engine;
955 struct igt_live_test t;
960 * Check we can submit requests to all engines sequentially, such
961 * that each successive request waits for the earlier ones. This
962 * tests that we don't execute requests out of order, even though
963 * they are running on independent engines.
966 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
970 err = igt_live_test_begin(&t, i915, __func__, "");
975 for_each_uabi_engine(engine, i915) {
976 struct i915_vma *batch;
978 batch = recursive_batch(i915);
980 err = PTR_ERR(batch);
981 pr_err("%s: Unable to create batch for %s, err=%d\n",
982 __func__, engine->name, err);
986 i915_vma_lock(batch);
987 request[idx] = intel_engine_create_kernel_request(engine);
988 if (IS_ERR(request[idx])) {
989 err = PTR_ERR(request[idx]);
990 pr_err("%s: Request allocation failed for %s with err=%d\n",
991 __func__, engine->name, err);
996 err = i915_request_await_dma_fence(request[idx],
999 i915_request_add(request[idx]);
1000 pr_err("%s: Request await failed for %s with err=%d\n",
1001 __func__, engine->name, err);
1006 err = i915_request_await_object(request[idx],
1009 err = i915_vma_move_to_active(batch, request[idx], 0);
1012 err = engine->emit_bb_start(request[idx],
1017 request[idx]->batch = batch;
1019 i915_request_get(request[idx]);
1020 i915_request_add(request[idx]);
1022 prev = request[idx];
1026 i915_vma_unlock(batch);
1032 for_each_uabi_engine(engine, i915) {
1035 if (i915_request_completed(request[idx])) {
1036 pr_err("%s(%s): request completed too early!\n",
1037 __func__, engine->name);
1042 err = recursive_batch_resolve(request[idx]->batch);
1044 pr_err("%s: failed to resolve batch, err=%d\n",
1049 timeout = i915_request_wait(request[idx], 0,
1050 MAX_SCHEDULE_TIMEOUT);
1053 pr_err("%s: error waiting for request on %s, err=%d\n",
1054 __func__, engine->name, err);
1058 GEM_BUG_ON(!i915_request_completed(request[idx]));
1062 err = igt_live_test_end(&t);
1066 for_each_uabi_engine(engine, i915) {
1072 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1075 *cmd = MI_BATCH_BUFFER_END;
1077 __i915_gem_object_flush_map(request[idx]->batch->obj,
1079 i915_gem_object_unpin_map(request[idx]->batch->obj);
1081 intel_gt_chipset_flush(engine->gt);
1084 i915_vma_put(request[idx]->batch);
1085 i915_request_put(request[idx]);
1093 static int __live_parallel_engine1(void *arg)
1095 struct intel_engine_cs *engine = arg;
1096 IGT_TIMEOUT(end_time);
1097 unsigned long count;
1101 intel_engine_pm_get(engine);
1103 struct i915_request *rq;
1105 rq = i915_request_create(engine->kernel_context);
1111 i915_request_get(rq);
1112 i915_request_add(rq);
1115 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1117 i915_request_put(rq);
1122 } while (!__igt_timeout(end_time, NULL));
1123 intel_engine_pm_put(engine);
1125 pr_info("%s: %lu request + sync\n", engine->name, count);
1129 static int __live_parallel_engineN(void *arg)
1131 struct intel_engine_cs *engine = arg;
1132 IGT_TIMEOUT(end_time);
1133 unsigned long count;
1137 intel_engine_pm_get(engine);
1139 struct i915_request *rq;
1141 rq = i915_request_create(engine->kernel_context);
1147 i915_request_add(rq);
1149 } while (!__igt_timeout(end_time, NULL));
1150 intel_engine_pm_put(engine);
1152 pr_info("%s: %lu requests\n", engine->name, count);
1156 static bool wake_all(struct drm_i915_private *i915)
1158 if (atomic_dec_and_test(&i915->selftest.counter)) {
1159 wake_up_var(&i915->selftest.counter);
1166 static int wait_for_all(struct drm_i915_private *i915)
1171 if (wait_var_event_timeout(&i915->selftest.counter,
1172 !atomic_read(&i915->selftest.counter),
1173 i915_selftest.timeout_jiffies))
1179 static int __live_parallel_spin(void *arg)
1181 struct intel_engine_cs *engine = arg;
1182 struct igt_spinner spin;
1183 struct i915_request *rq;
1187 * Create a spinner running for eternity on each engine. If a second
1188 * spinner is incorrectly placed on the same engine, it will not be
1189 * able to start in time.
1192 if (igt_spinner_init(&spin, engine->gt)) {
1193 wake_all(engine->i915);
1197 intel_engine_pm_get(engine);
1198 rq = igt_spinner_create_request(&spin,
1199 engine->kernel_context,
1200 MI_NOOP); /* no preemption */
1201 intel_engine_pm_put(engine);
1206 wake_all(engine->i915);
1210 i915_request_get(rq);
1211 i915_request_add(rq);
1212 if (igt_wait_for_spinner(&spin, rq)) {
1213 /* Occupy this engine for the whole test */
1214 err = wait_for_all(engine->i915);
1216 pr_err("Failed to start spinner on %s\n", engine->name);
1219 igt_spinner_end(&spin);
1221 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1223 i915_request_put(rq);
1226 igt_spinner_fini(&spin);
1230 static int live_parallel_engines(void *arg)
1232 struct drm_i915_private *i915 = arg;
1233 static int (* const func[])(void *arg) = {
1234 __live_parallel_engine1,
1235 __live_parallel_engineN,
1236 __live_parallel_spin,
1239 const unsigned int nengines = num_uabi_engines(i915);
1240 struct intel_engine_cs *engine;
1241 int (* const *fn)(void *arg);
1242 struct task_struct **tsk;
1246 * Check we can submit requests to all engines concurrently. This
1247 * tests that we load up the system maximally.
1250 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1254 for (fn = func; !err && *fn; fn++) {
1255 char name[KSYM_NAME_LEN];
1256 struct igt_live_test t;
1259 snprintf(name, sizeof(name), "%ps", *fn);
1260 err = igt_live_test_begin(&t, i915, __func__, name);
1264 atomic_set(&i915->selftest.counter, nengines);
1267 for_each_uabi_engine(engine, i915) {
1268 tsk[idx] = kthread_run(*fn, engine,
1271 if (IS_ERR(tsk[idx])) {
1272 err = PTR_ERR(tsk[idx]);
1275 get_task_struct(tsk[idx++]);
1278 yield(); /* start all threads before we kthread_stop() */
1281 for_each_uabi_engine(engine, i915) {
1284 if (IS_ERR(tsk[idx]))
1287 status = kthread_stop(tsk[idx]);
1291 put_task_struct(tsk[idx++]);
1294 if (igt_live_test_end(&t))
1303 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1305 struct i915_request *rq;
1309 * Before execlists, all contexts share the same ringbuffer. With
1310 * execlists, each context/engine has a separate ringbuffer which is,
1311 * for the purposes of this test, inexhaustible.
1313 * For the global ringbuffer though, we have to be very careful
1314 * that we do not wrap while preventing the execution of requests
1315 * with an unsignaled fence.
1317 if (HAS_EXECLISTS(ctx->i915))
1320 rq = igt_request_alloc(ctx, engine);
1326 ret = rq->ring->size - rq->reserved_space;
1327 i915_request_add(rq);
1329 sz = rq->ring->emit - rq->head;
1331 sz += rq->ring->size;
1333 ret /= 2; /* leave half spare, in case of emergency! */
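/*
 * Illustrative arithmetic, with made-up numbers: on a legacy ring of 16KiB
 * (reserved space already subtracted) and roughly 256 bytes emitted per
 * request, the limit works out to about (16384 / 256) / 2 = 32 outstanding
 * requests, comfortably below the point at which the shared ring could wrap
 * while a fence is still blocking execution.
 */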
1339 static int live_breadcrumbs_smoketest(void *arg)
1341 struct drm_i915_private *i915 = arg;
1342 const unsigned int nengines = num_uabi_engines(i915);
1343 const unsigned int ncpus = num_online_cpus();
1344 unsigned long num_waits, num_fences;
1345 struct intel_engine_cs *engine;
1346 struct task_struct **threads;
1347 struct igt_live_test live;
1348 intel_wakeref_t wakeref;
1349 struct smoketest *smoke;
1350 unsigned int n, idx;
1355 * Smoketest our breadcrumb/signal handling for requests across multiple
1356 * threads. A very simple test to only catch the most egregious of bugs.
1357 * See __igt_breadcrumbs_smoketest();
1359 * On real hardware this time.
1362 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1364 file = mock_file(i915);
1366 ret = PTR_ERR(file);
1370 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1376 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1382 smoke[0].request_alloc = __live_request_alloc;
1383 smoke[0].ncontexts = 64;
1384 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1385 sizeof(*smoke[0].contexts),
1387 if (!smoke[0].contexts) {
1392 for (n = 0; n < smoke[0].ncontexts; n++) {
1393 smoke[0].contexts[n] = live_context(i915, file);
1394 if (IS_ERR(smoke[0].contexts[n])) {
1395 ret = PTR_ERR(smoke[0].contexts[n]);
1400 ret = igt_live_test_begin(&live, i915, __func__, "");
1405 for_each_uabi_engine(engine, i915) {
1406 smoke[idx] = smoke[0];
1407 smoke[idx].engine = engine;
1408 smoke[idx].max_batch =
1409 max_batches(smoke[0].contexts[0], engine);
1410 if (smoke[idx].max_batch < 0) {
1411 ret = smoke[idx].max_batch;
1414 /* One ring interleaved between requests from all cpus */
1415 smoke[idx].max_batch /= num_online_cpus() + 1;
1416 pr_debug("Limiting batches to %d requests on %s\n",
1417 smoke[idx].max_batch, engine->name);
1419 for (n = 0; n < ncpus; n++) {
1420 struct task_struct *tsk;
1422 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1423 &smoke[idx], "igt/%d.%d", idx, n);
1429 get_task_struct(tsk);
1430 threads[idx * ncpus + n] = tsk;
1436 yield(); /* start all threads before we begin */
1437 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1443 for_each_uabi_engine(engine, i915) {
1444 for (n = 0; n < ncpus; n++) {
1445 struct task_struct *tsk = threads[idx * ncpus + n];
1451 err = kthread_stop(tsk);
1452 if (err < 0 && !ret)
1455 put_task_struct(tsk);
1458 num_waits += atomic_long_read(&smoke[idx].num_waits);
1459 num_fences += atomic_long_read(&smoke[idx].num_fences);
1462 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1463 num_waits, num_fences, idx, ncpus);
1465 ret = igt_live_test_end(&live) ?: ret;
1467 kfree(smoke[0].contexts);
1475 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1480 int i915_request_live_selftests(struct drm_i915_private *i915)
1482 static const struct i915_subtest tests[] = {
1483 SUBTEST(live_nop_request),
1484 SUBTEST(live_all_engines),
1485 SUBTEST(live_sequential_engines),
1486 SUBTEST(live_parallel_engines),
1487 SUBTEST(live_empty_request),
1488 SUBTEST(live_breadcrumbs_smoketest),
1491 if (intel_gt_is_wedged(&i915->gt))
1494 return i915_subtests(tests, i915);
1497 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1499 struct i915_request *rq;
1500 struct dma_fence *fence;
1502 rq = intel_engine_create_kernel_request(ce->engine);
1506 fence = i915_active_fence_get(&ce->timeline->last_request);
1508 i915_request_await_dma_fence(rq, fence);
1509 dma_fence_put(fence);
1512 rq = i915_request_get(rq);
1513 i915_request_add(rq);
1514 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1516 i915_request_put(rq);
1518 while (!err && !intel_engine_is_idle(ce->engine))
1519 intel_engine_flush_submission(ce->engine);
1525 struct intel_engine_cs *engine;
1526 unsigned long count;
1532 struct perf_series {
1533 struct drm_i915_private *i915;
1534 unsigned int nengines;
1535 struct intel_context *ce[];
1538 static int cmp_u32(const void *A, const void *B)
1540 const u32 *a = A, *b = B;
1545 static u32 trifilter(u32 *a)
1550 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1552 sum = mul_u32_u32(a[2], 2);
1556 GEM_BUG_ON(sum > U32_MAX);
1561 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1563 u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1565 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
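/*
 * Note on scaling: trifilter() returns a 2:1:1 weighted sum of the middle
 * three of the sorted samples, i.e. a value scaled up by 1 << TF_BIAS
 * (assuming the elided defines use TF_COUNT of 5 and TF_BIAS of 2). Both
 * cycles_to_ns() above and the "cycles >> TF_BIAS" in the pr_info() callers
 * divide that bias back out before reporting.
 */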
1568 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1570 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1571 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1578 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1580 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1588 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1590 *cs++ = MI_SEMAPHORE_WAIT |
1591 MI_SEMAPHORE_GLOBAL_GTT |
1601 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1603 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1606 static void semaphore_set(u32 *sema, u32 value)
1608 WRITE_ONCE(*sema, value);
1609 wmb(); /* flush the update to the cache, and beyond */
1612 static u32 *hwsp_scratch(const struct intel_context *ce)
1614 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1617 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1619 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1620 offset_in_page(dw));
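/*
 * The engine's hardware status page doubles as scratch space for these
 * measurements: hwsp_scratch() returns a zeroed run of dwords through the
 * HWSP's CPU mapping, and hwsp_offset() converts one of those dwords into
 * the GGTT address used by the MI commands emitted below, so the CPU and
 * the GPU read and write the very same memory.
 */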
1623 static int measure_semaphore_response(struct intel_context *ce)
1625 u32 *sema = hwsp_scratch(ce);
1626 const u32 offset = hwsp_offset(ce, sema);
1627 u32 elapsed[TF_COUNT], cycles;
1628 struct i915_request *rq;
1634 * Measure how many cycles it takes for the HW to detect the change
1635 * in a semaphore value.
1637 * A: read CS_TIMESTAMP from CPU
1639 * B: read CS_TIMESTAMP on GPU
1641 * Semaphore latency: B - A
1644 semaphore_set(sema, -1);
1646 rq = i915_request_create(ce);
1650 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1652 i915_request_add(rq);
1657 cs = emit_store_dw(cs, offset, 0);
1658 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1659 cs = emit_semaphore_poll_until(cs, offset, i);
1660 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1661 cs = emit_store_dw(cs, offset, 0);
1664 intel_ring_advance(rq, cs);
1665 i915_request_add(rq);
1667 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1672 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1674 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1675 semaphore_set(sema, i);
1678 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1683 elapsed[i - 1] = sema[i] - cycles;
1686 cycles = trifilter(elapsed);
1687 pr_info("%s: semaphore response %d cycles, %lluns\n",
1688 ce->engine->name, cycles >> TF_BIAS,
1689 cycles_to_ns(ce->engine, cycles));
1691 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1694 intel_gt_set_wedged(ce->engine->gt);
1698 static int measure_idle_dispatch(struct intel_context *ce)
1700 u32 *sema = hwsp_scratch(ce);
1701 const u32 offset = hwsp_offset(ce, sema);
1702 u32 elapsed[TF_COUNT], cycles;
1708 * Measure how long it takes for us to submit a request while the
1709 * engine is idle, but is resting in our context.
1711 * A: read CS_TIMESTAMP from CPU
1713 * B: read CS_TIMESTAMP on GPU
1715 * Submission latency: B - A
1718 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1719 struct i915_request *rq;
1721 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1725 rq = i915_request_create(ce);
1731 cs = intel_ring_begin(rq, 4);
1733 i915_request_add(rq);
1738 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1740 intel_ring_advance(rq, cs);
1744 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1745 i915_request_add(rq);
1750 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1754 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1755 elapsed[i] = sema[i] - elapsed[i];
1757 cycles = trifilter(elapsed);
1758 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1759 ce->engine->name, cycles >> TF_BIAS,
1760 cycles_to_ns(ce->engine, cycles));
1762 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1765 intel_gt_set_wedged(ce->engine->gt);
1769 static int measure_busy_dispatch(struct intel_context *ce)
1771 u32 *sema = hwsp_scratch(ce);
1772 const u32 offset = hwsp_offset(ce, sema);
1773 u32 elapsed[TF_COUNT + 1], cycles;
1779 * Measure how long it takes for us to submit a request while the
1780 * engine is busy, polling on a semaphore in our context. With
1781 * direct submission, this will include the cost of a lite restore.
1783 * A: read CS_TIMESTAMP from CPU
1785 * B: read CS_TIMESTAMP on GPU
1787 * Submission latency: B - A
1790 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1791 struct i915_request *rq;
1793 rq = i915_request_create(ce);
1799 cs = intel_ring_begin(rq, 12);
1801 i915_request_add(rq);
1806 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1807 cs = emit_semaphore_poll_until(cs, offset, i);
1808 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1810 intel_ring_advance(rq, cs);
1812 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1819 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1820 i915_request_add(rq);
1822 semaphore_set(sema, i - 1);
1826 wait_for(READ_ONCE(sema[i - 1]), 500);
1827 semaphore_set(sema, i - 1);
1829 for (i = 1; i <= TF_COUNT; i++) {
1830 GEM_BUG_ON(sema[i] == -1);
1831 elapsed[i - 1] = sema[i] - elapsed[i];
1834 cycles = trifilter(elapsed);
1835 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1836 ce->engine->name, cycles >> TF_BIAS,
1837 cycles_to_ns(ce->engine, cycles));
1839 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1842 intel_gt_set_wedged(ce->engine->gt);
1846 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1849 i915_ggtt_offset(engine->status_page.vma) +
1850 offset_in_page(sema);
1851 struct i915_request *rq;
1854 rq = i915_request_create(engine->kernel_context);
1858 cs = intel_ring_begin(rq, 4);
1860 i915_request_add(rq);
1864 cs = emit_semaphore_poll(cs, mode, value, offset);
1866 intel_ring_advance(rq, cs);
1867 i915_request_add(rq);
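/*
 * plug() stalls the engine: it queues a kernel-context request that sits in
 * a MI_SEMAPHORE_WAIT poll on the status page until, with the mode and value
 * its callers pass, the CPU releases it via semaphore_set(). Some of the
 * measure_* routines below use this to hold back a whole batch of queued
 * requests and then release them in one go.
 */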
1872 static int measure_inter_request(struct intel_context *ce)
1874 u32 *sema = hwsp_scratch(ce);
1875 const u32 offset = hwsp_offset(ce, sema);
1876 u32 elapsed[TF_COUNT + 1], cycles;
1877 struct i915_sw_fence *submit;
1881 * Measure how long it takes to advance from one request into the
1882 * next. Between each request we flush the GPU caches to memory,
1883 * update the breadcrumbs, and then invalidate those caches.
1884 * We queue up all the requests to be submitted in one batch so
1885 * it should be one set of contiguous measurements.
1887 * A: read CS_TIMESTAMP on GPU
1889 * B: read CS_TIMESTAMP on GPU
1891 * Request latency: B - A
1894 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1898 submit = heap_fence_create(GFP_KERNEL);
1900 semaphore_set(sema, 1);
1904 intel_engine_flush_submission(ce->engine);
1905 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1906 struct i915_request *rq;
1909 rq = i915_request_create(ce);
1915 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1919 i915_request_add(rq);
1923 cs = intel_ring_begin(rq, 4);
1925 i915_request_add(rq);
1930 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1932 intel_ring_advance(rq, cs);
1933 i915_request_add(rq);
1936 i915_sw_fence_commit(submit);
1938 intel_engine_flush_submission(ce->engine);
1939 heap_fence_put(submit);
1941 semaphore_set(sema, 1);
1942 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1946 for (i = 1; i <= TF_COUNT; i++)
1947 elapsed[i - 1] = sema[i + 1] - sema[i];
1949 cycles = trifilter(elapsed);
1950 pr_info("%s: inter-request latency %d cycles, %lluns\n",
1951 ce->engine->name, cycles >> TF_BIAS,
1952 cycles_to_ns(ce->engine, cycles));
1954 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1957 i915_sw_fence_commit(submit);
1958 heap_fence_put(submit);
1959 semaphore_set(sema, 1);
1961 intel_gt_set_wedged(ce->engine->gt);
1965 static int measure_context_switch(struct intel_context *ce)
1967 u32 *sema = hwsp_scratch(ce);
1968 const u32 offset = hwsp_offset(ce, sema);
1969 struct i915_request *fence = NULL;
1970 u32 elapsed[TF_COUNT + 1], cycles;
1975 * Measure how long it takes to advance from one request in one
1976 * context to a request in another context. This allows us to
1977 * measure how long the context save/restore take, along with all
1978 * the inter-context setup we require.
1980 * A: read CS_TIMESTAMP on GPU
1982 * B: read CS_TIMESTAMP on GPU
1984 * Context switch latency: B - A
1987 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1991 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1992 struct intel_context *arr[] = {
1993 ce, ce->engine->kernel_context
1995 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1997 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1998 struct i915_request *rq;
2000 rq = i915_request_create(arr[j]);
2007 err = i915_request_await_dma_fence(rq,
2010 i915_request_add(rq);
2015 cs = intel_ring_begin(rq, 4);
2017 i915_request_add(rq);
2022 cs = emit_timestamp_store(cs, ce, addr);
2023 addr += sizeof(u32);
2025 intel_ring_advance(rq, cs);
2027 i915_request_put(fence);
2028 fence = i915_request_get(rq);
2030 i915_request_add(rq);
2033 i915_request_put(fence);
2034 intel_engine_flush_submission(ce->engine);
2036 semaphore_set(sema, 1);
2037 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2041 for (i = 1; i <= TF_COUNT; i++)
2042 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2044 cycles = trifilter(elapsed);
2045 pr_info("%s: context switch latency %d cycles, %lluns\n",
2046 ce->engine->name, cycles >> TF_BIAS,
2047 cycles_to_ns(ce->engine, cycles));
2049 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2052 i915_request_put(fence);
2053 semaphore_set(sema, 1);
2055 intel_gt_set_wedged(ce->engine->gt);
2059 static int measure_preemption(struct intel_context *ce)
2061 u32 *sema = hwsp_scratch(ce);
2062 const u32 offset = hwsp_offset(ce, sema);
2063 u32 elapsed[TF_COUNT], cycles;
2069 * We measure two latencies while triggering preemption. The first
2070 * latency is how long it takes for us to submit a preempting request.
2071 * The second latency is how long it takes for us to return from the
2072 * preemption back to the original context.
2074 * A: read CS_TIMESTAMP from CPU
2076 * B: read CS_TIMESTAMP on GPU (in preempting context)
2078 * C: read CS_TIMESTAMP on GPU (in original context)
2080 * Preemption dispatch latency: B - A
2081 * Preemption switch latency: C - B
2084 if (!intel_engine_has_preemption(ce->engine))
2087 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2088 u32 addr = offset + 2 * i * sizeof(u32);
2089 struct i915_request *rq;
2091 rq = i915_request_create(ce);
2097 cs = intel_ring_begin(rq, 12);
2099 i915_request_add(rq);
2104 cs = emit_store_dw(cs, addr, -1);
2105 cs = emit_semaphore_poll_until(cs, offset, i);
2106 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2108 intel_ring_advance(rq, cs);
2109 i915_request_add(rq);
2111 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2116 rq = i915_request_create(ce->engine->kernel_context);
2122 cs = intel_ring_begin(rq, 8);
2124 i915_request_add(rq);
2129 cs = emit_timestamp_store(cs, ce, addr);
2130 cs = emit_store_dw(cs, offset, i);
2132 intel_ring_advance(rq, cs);
2133 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2135 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136 i915_request_add(rq);
2139 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2144 for (i = 1; i <= TF_COUNT; i++)
2145 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2147 cycles = trifilter(elapsed);
2148 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2149 ce->engine->name, cycles >> TF_BIAS,
2150 cycles_to_ns(ce->engine, cycles));
2152 for (i = 1; i <= TF_COUNT; i++)
2153 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2155 cycles = trifilter(elapsed);
2156 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2157 ce->engine->name, cycles >> TF_BIAS,
2158 cycles_to_ns(ce->engine, cycles));
2160 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2163 intel_gt_set_wedged(ce->engine->gt);
2168 struct dma_fence_cb base;
2172 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2174 struct signal_cb *s = container_of(cb, typeof(*s), base);
2176 smp_store_mb(s->seen, true); /* be safe, be strong */
2179 static int measure_completion(struct intel_context *ce)
2181 u32 *sema = hwsp_scratch(ce);
2182 const u32 offset = hwsp_offset(ce, sema);
2183 u32 elapsed[TF_COUNT], cycles;
2189 * Measure how long it takes for the signal (interrupt) to be
2190 * sent by the GPU and then processed by the CPU.
2192 * A: read CS_TIMESTAMP on GPU
2194 * B: read CS_TIMESTAMP from CPU
2196 * Completion latency: B - A
2199 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2200 struct signal_cb cb = { .seen = false };
2201 struct i915_request *rq;
2203 rq = i915_request_create(ce);
2209 cs = intel_ring_begin(rq, 12);
2211 i915_request_add(rq);
2216 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2217 cs = emit_semaphore_poll_until(cs, offset, i);
2218 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2220 intel_ring_advance(rq, cs);
2222 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2225 i915_request_add(rq);
2228 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2234 semaphore_set(sema, i);
2235 while (!READ_ONCE(cb.seen))
2238 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2242 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2246 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2247 GEM_BUG_ON(sema[i + 1] == -1);
2248 elapsed[i] = elapsed[i] - sema[i + 1];
2251 cycles = trifilter(elapsed);
2252 pr_info("%s: completion latency %d cycles, %lluns\n",
2253 ce->engine->name, cycles >> TF_BIAS,
2254 cycles_to_ns(ce->engine, cycles));
2256 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2259 intel_gt_set_wedged(ce->engine->gt);
2263 static void rps_pin(struct intel_gt *gt)
2265 /* Pin the frequency to max */
2266 atomic_inc(&gt->rps.num_waiters);
2267 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2269 mutex_lock(&gt->rps.lock);
2270 intel_rps_set(&gt->rps, gt->rps.max_freq);
2271 mutex_unlock(&gt->rps.lock);
2274 static void rps_unpin(struct intel_gt *gt)
2276 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2277 atomic_dec(&gt->rps.num_waiters);
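/*
 * Keeping RPS pinned at the maximum frequency (with forcewake held) for the
 * duration of the measurements means the engine runs at a fixed, known clock,
 * so the latencies sampled while the pin is held reflect best-case hardware
 * response rather than whatever frequency the governor picked mid-test.
 */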
2280 static int perf_request_latency(void *arg)
2282 struct drm_i915_private *i915 = arg;
2283 struct intel_engine_cs *engine;
2284 struct pm_qos_request qos;
2287 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2290 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
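/*
 * A zero-latency PM QoS request keeps the CPU out of deep C-states for the
 * duration of the test, so the CPU-side timestamp reads and semaphore writes
 * are not inflated by C-state exit latency.
 */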
2292 for_each_uabi_engine(engine, i915) {
2293 struct intel_context *ce;
2295 ce = intel_context_create(engine);
2301 err = intel_context_pin(ce);
2303 intel_context_put(ce);
2307 st_engine_heartbeat_disable(engine);
2308 rps_pin(engine->gt);
2311 err = measure_semaphore_response(ce);
2313 err = measure_idle_dispatch(ce);
2315 err = measure_busy_dispatch(ce);
2317 err = measure_inter_request(ce);
2319 err = measure_context_switch(ce);
2321 err = measure_preemption(ce);
2323 err = measure_completion(ce);
2325 rps_unpin(engine->gt);
2326 st_engine_heartbeat_enable(engine);
2328 intel_context_unpin(ce);
2329 intel_context_put(ce);
2335 if (igt_flush_test(i915))
2338 cpu_latency_qos_remove_request(&qos);
2342 static int s_sync0(void *arg)
2344 struct perf_series *ps = arg;
2345 IGT_TIMEOUT(end_time);
2346 unsigned int idx = 0;
2349 GEM_BUG_ON(!ps->nengines);
2351 struct i915_request *rq;
2353 rq = i915_request_create(ps->ce[idx]);
2359 i915_request_get(rq);
2360 i915_request_add(rq);
2362 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2364 i915_request_put(rq);
2368 if (++idx == ps->nengines)
2370 } while (!__igt_timeout(end_time, NULL));
2375 static int s_sync1(void *arg)
2377 struct perf_series *ps = arg;
2378 struct i915_request *prev = NULL;
2379 IGT_TIMEOUT(end_time);
2380 unsigned int idx = 0;
2383 GEM_BUG_ON(!ps->nengines);
2385 struct i915_request *rq;
2387 rq = i915_request_create(ps->ce[idx]);
2393 i915_request_get(rq);
2394 i915_request_add(rq);
2396 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2398 i915_request_put(prev);
2403 if (++idx == ps->nengines)
2405 } while (!__igt_timeout(end_time, NULL));
2406 i915_request_put(prev);
2411 static int s_many(void *arg)
2413 struct perf_series *ps = arg;
2414 IGT_TIMEOUT(end_time);
2415 unsigned int idx = 0;
2417 GEM_BUG_ON(!ps->nengines);
2419 struct i915_request *rq;
2421 rq = i915_request_create(ps->ce[idx]);
2425 i915_request_add(rq);
2427 if (++idx == ps->nengines)
2429 } while (!__igt_timeout(end_time, NULL));
2434 static int perf_series_engines(void *arg)
2436 struct drm_i915_private *i915 = arg;
2437 static int (* const func[])(void *arg) = {
2443 const unsigned int nengines = num_uabi_engines(i915);
2444 struct intel_engine_cs *engine;
2445 int (* const *fn)(void *arg);
2446 struct pm_qos_request qos;
2447 struct perf_stats *stats;
2448 struct perf_series *ps;
2452 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2456 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2462 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2465 ps->nengines = nengines;
2468 for_each_uabi_engine(engine, i915) {
2469 struct intel_context *ce;
2471 ce = intel_context_create(engine);
2477 err = intel_context_pin(ce);
2479 intel_context_put(ce);
2485 GEM_BUG_ON(idx != ps->nengines);
2487 for (fn = func; *fn && !err; fn++) {
2488 char name[KSYM_NAME_LEN];
2489 struct igt_live_test t;
2491 snprintf(name, sizeof(name), "%ps", *fn);
2492 err = igt_live_test_begin(&t, i915, __func__, name);
2496 for (idx = 0; idx < nengines; idx++) {
2497 struct perf_stats *p =
2498 memset(&stats[idx], 0, sizeof(stats[idx]));
2499 struct intel_context *ce = ps->ce[idx];
2501 p->engine = ps->ce[idx]->engine;
2502 intel_engine_pm_get(p->engine);
2504 if (intel_engine_supports_stats(p->engine))
2505 p->busy = intel_engine_get_busy_time(p->engine,
2508 p->time = ktime_get();
2509 p->runtime = -intel_context_get_total_runtime_ns(ce);
2513 if (igt_live_test_end(&t))
2516 for (idx = 0; idx < nengines; idx++) {
2517 struct perf_stats *p = &stats[idx];
2518 struct intel_context *ce = ps->ce[idx];
2519 int integer, decimal;
2523 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2528 p->time = ktime_sub(now, p->time);
2530 err = switch_to_kernel_sync(ce, err);
2531 p->runtime += intel_context_get_total_runtime_ns(ce);
2532 intel_engine_pm_put(p->engine);
2534 busy = 100 * ktime_to_ns(p->busy);
2535 dt = ktime_to_ns(p->time);
2537 integer = div64_u64(busy, dt);
2538 busy -= integer * dt;
2539 decimal = div64_u64(100 * busy, dt);
2545 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2546 name, p->engine->name, ce->timeline->seqno,
2548 div_u64(p->runtime, 1000 * 1000),
2549 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2554 for (idx = 0; idx < nengines; idx++) {
2555 if (IS_ERR_OR_NULL(ps->ce[idx]))
2558 intel_context_unpin(ps->ce[idx]);
2559 intel_context_put(ps->ce[idx]);
2563 cpu_latency_qos_remove_request(&qos);
2568 static int p_sync0(void *arg)
2570 struct perf_stats *p = arg;
2571 struct intel_engine_cs *engine = p->engine;
2572 struct intel_context *ce;
2573 IGT_TIMEOUT(end_time);
2574 unsigned long count;
2578 ce = intel_context_create(engine);
2582 err = intel_context_pin(ce);
2584 intel_context_put(ce);
2588 if (intel_engine_supports_stats(engine)) {
2589 p->busy = intel_engine_get_busy_time(engine, &p->time);
2592 p->time = ktime_get();
2598 struct i915_request *rq;
2600 rq = i915_request_create(ce);
2606 i915_request_get(rq);
2607 i915_request_add(rq);
2610 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2612 i915_request_put(rq);
2617 } while (!__igt_timeout(end_time, NULL));
2622 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2624 p->time = ktime_sub(now, p->time);
2626 p->time = ktime_sub(ktime_get(), p->time);
2629 err = switch_to_kernel_sync(ce, err);
2630 p->runtime = intel_context_get_total_runtime_ns(ce);
2633 intel_context_unpin(ce);
2634 intel_context_put(ce);
2638 static int p_sync1(void *arg)
2640 struct perf_stats *p = arg;
2641 struct intel_engine_cs *engine = p->engine;
2642 struct i915_request *prev = NULL;
2643 struct intel_context *ce;
2644 IGT_TIMEOUT(end_time);
2645 unsigned long count;
2649 ce = intel_context_create(engine);
2653 err = intel_context_pin(ce);
2655 intel_context_put(ce);
2659 if (intel_engine_supports_stats(engine)) {
2660 p->busy = intel_engine_get_busy_time(engine, &p->time);
2663 p->time = ktime_get();
2669 struct i915_request *rq;
2671 rq = i915_request_create(ce);
2677 i915_request_get(rq);
2678 i915_request_add(rq);
2681 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2683 i915_request_put(prev);
2689 } while (!__igt_timeout(end_time, NULL));
2690 i915_request_put(prev);
2695 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2697 p->time = ktime_sub(now, p->time);
2699 p->time = ktime_sub(ktime_get(), p->time);
2702 err = switch_to_kernel_sync(ce, err);
2703 p->runtime = intel_context_get_total_runtime_ns(ce);
2706 intel_context_unpin(ce);
2707 intel_context_put(ce);
2711 static int p_many(void *arg)
2713 struct perf_stats *p = arg;
2714 struct intel_engine_cs *engine = p->engine;
2715 struct intel_context *ce;
2716 IGT_TIMEOUT(end_time);
2717 unsigned long count;
2721 ce = intel_context_create(engine);
2725 err = intel_context_pin(ce);
2727 intel_context_put(ce);
2731 if (intel_engine_supports_stats(engine)) {
2732 p->busy = intel_engine_get_busy_time(engine, &p->time);
2735 p->time = ktime_get();
2741 struct i915_request *rq;
2743 rq = i915_request_create(ce);
2749 i915_request_add(rq);
2751 } while (!__igt_timeout(end_time, NULL));
2756 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2758 p->time = ktime_sub(now, p->time);
2760 p->time = ktime_sub(ktime_get(), p->time);
2763 err = switch_to_kernel_sync(ce, err);
2764 p->runtime = intel_context_get_total_runtime_ns(ce);
2767 intel_context_unpin(ce);
2768 intel_context_put(ce);
2772 static int perf_parallel_engines(void *arg)
2774 struct drm_i915_private *i915 = arg;
2775 static int (* const func[])(void *arg) = {
2781 const unsigned int nengines = num_uabi_engines(i915);
2782 struct intel_engine_cs *engine;
2783 int (* const *fn)(void *arg);
2784 struct pm_qos_request qos;
2786 struct perf_stats p;
2787 struct task_struct *tsk;
2791 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2795 cpu_latency_qos_add_request(&qos, 0);
2797 for (fn = func; *fn; fn++) {
2798 char name[KSYM_NAME_LEN];
2799 struct igt_live_test t;
2802 snprintf(name, sizeof(name), "%ps", *fn);
2803 err = igt_live_test_begin(&t, i915, __func__, name);
2807 atomic_set(&i915->selftest.counter, nengines);
2810 for_each_uabi_engine(engine, i915) {
2811 intel_engine_pm_get(engine);
2813 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2814 engines[idx].p.engine = engine;
2816 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2817 "igt:%s", engine->name);
2818 if (IS_ERR(engines[idx].tsk)) {
2819 err = PTR_ERR(engines[idx].tsk);
2820 intel_engine_pm_put(engine);
2823 get_task_struct(engines[idx++].tsk);
2826 yield(); /* start all threads before we kthread_stop() */
2829 for_each_uabi_engine(engine, i915) {
2832 if (IS_ERR(engines[idx].tsk))
2835 status = kthread_stop(engines[idx].tsk);
2839 intel_engine_pm_put(engine);
2840 put_task_struct(engines[idx++].tsk);
2843 if (igt_live_test_end(&t))
2849 for_each_uabi_engine(engine, i915) {
2850 struct perf_stats *p = &engines[idx].p;
2851 u64 busy = 100 * ktime_to_ns(p->busy);
2852 u64 dt = ktime_to_ns(p->time);
2853 int integer, decimal;
2856 integer = div64_u64(busy, dt);
2857 busy -= integer * dt;
2858 decimal = div64_u64(100 * busy, dt);
2864 GEM_BUG_ON(engine != p->engine);
2865 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2866 name, engine->name, p->count, integer, decimal,
2867 div_u64(p->runtime, 1000 * 1000),
2868 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2873 cpu_latency_qos_remove_request(&qos);
2878 int i915_request_perf_selftests(struct drm_i915_private *i915)
2880 static const struct i915_subtest tests[] = {
2881 SUBTEST(perf_request_latency),
2882 SUBTEST(perf_series_engines),
2883 SUBTEST(perf_parallel_engines),
2886 if (intel_gt_is_wedged(&i915->gt))
2889 return i915_subtests(tests, i915);