GNU Linux-libre 5.10.215-gnu1
[releases.git] drivers/gpu/drm/i915/selftests/i915_request.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_requests.h"
37 #include "gt/selftest_engine_heartbeat.h"
38
39 #include "i915_random.h"
40 #include "i915_selftest.h"
41 #include "igt_flush_test.h"
42 #include "igt_live_test.h"
43 #include "igt_spinner.h"
44 #include "lib_sw_fence.h"
45
46 #include "mock_drm.h"
47 #include "mock_gem_device.h"
48
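/* Count the engines this device exposes to userspace (uabi engines). */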
49 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
50 {
51         struct intel_engine_cs *engine;
52         unsigned int count;
53
54         count = 0;
55         for_each_uabi_engine(engine, i915)
56                 count++;
57
58         return count;
59 }
60
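/* Look up the first render engine exposed to userspace (rcs0), if any. */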
61 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
62 {
63         return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
64 }
65
66 static int igt_add_request(void *arg)
67 {
68         struct drm_i915_private *i915 = arg;
69         struct i915_request *request;
70
71         /* Basic preliminary test to create a request and let it loose! */
72
73         request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
74         if (!request)
75                 return -ENOMEM;
76
77         i915_request_add(request);
78
79         return 0;
80 }
81
82 static int igt_wait_request(void *arg)
83 {
84         const long T = HZ / 4;
85         struct drm_i915_private *i915 = arg;
86         struct i915_request *request;
87         int err = -EINVAL;
88
89         /* Submit a request, then wait upon it */
90
91         request = mock_request(rcs0(i915)->kernel_context, T);
92         if (!request)
93                 return -ENOMEM;
94
95         i915_request_get(request);
96
97         if (i915_request_wait(request, 0, 0) != -ETIME) {
98                 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
99                 goto out_request;
100         }
101
102         if (i915_request_wait(request, 0, T) != -ETIME) {
103                 pr_err("request wait succeeded (expected timeout before submit!)\n");
104                 goto out_request;
105         }
106
107         if (i915_request_completed(request)) {
108                 pr_err("request completed before submit!!\n");
109                 goto out_request;
110         }
111
112         i915_request_add(request);
113
114         if (i915_request_wait(request, 0, 0) != -ETIME) {
115                 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
116                 goto out_request;
117         }
118
119         if (i915_request_completed(request)) {
120                 pr_err("request completed immediately!\n");
121                 goto out_request;
122         }
123
124         if (i915_request_wait(request, 0, T / 2) != -ETIME) {
125                 pr_err("request wait succeeded (expected timeout!)\n");
126                 goto out_request;
127         }
128
129         if (i915_request_wait(request, 0, T) == -ETIME) {
130                 pr_err("request wait timed out!\n");
131                 goto out_request;
132         }
133
134         if (!i915_request_completed(request)) {
135                 pr_err("request not complete after waiting!\n");
136                 goto out_request;
137         }
138
139         if (i915_request_wait(request, 0, T) == -ETIME) {
140                 pr_err("request wait timed out when already complete!\n");
141                 goto out_request;
142         }
143
144         err = 0;
145 out_request:
146         i915_request_put(request);
147         mock_device_flush(i915);
148         return err;
149 }
150
151 static int igt_fence_wait(void *arg)
152 {
153         const long T = HZ / 4;
154         struct drm_i915_private *i915 = arg;
155         struct i915_request *request;
156         int err = -EINVAL;
157
158         /* Submit a request, treat it as a fence and wait upon it */
159
160         request = mock_request(rcs0(i915)->kernel_context, T);
161         if (!request)
162                 return -ENOMEM;
163
164         if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
165                 pr_err("fence wait success before submit (expected timeout)!\n");
166                 goto out;
167         }
168
169         i915_request_add(request);
170
171         if (dma_fence_is_signaled(&request->fence)) {
172                 pr_err("fence signaled immediately!\n");
173                 goto out;
174         }
175
176         if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
177                 pr_err("fence wait success after submit (expected timeout)!\n");
178                 goto out;
179         }
180
181         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
182                 pr_err("fence wait timed out (expected success)!\n");
183                 goto out;
184         }
185
186         if (!dma_fence_is_signaled(&request->fence)) {
187                 pr_err("fence unsignaled after waiting!\n");
188                 goto out;
189         }
190
191         if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
192                 pr_err("fence wait timed out when complete (expected success)!\n");
193                 goto out;
194         }
195
196         err = 0;
197 out:
198         mock_device_flush(i915);
199         return err;
200 }
201
202 static int igt_request_rewind(void *arg)
203 {
204         struct drm_i915_private *i915 = arg;
205         struct i915_request *request, *vip;
206         struct i915_gem_context *ctx[2];
207         struct intel_context *ce;
208         int err = -EINVAL;
209
210         ctx[0] = mock_context(i915, "A");
211
212         ce = i915_gem_context_get_engine(ctx[0], RCS0);
213         GEM_BUG_ON(IS_ERR(ce));
214         request = mock_request(ce, 2 * HZ);
215         intel_context_put(ce);
216         if (!request) {
217                 err = -ENOMEM;
218                 goto err_context_0;
219         }
220
221         i915_request_get(request);
222         i915_request_add(request);
223
224         ctx[1] = mock_context(i915, "B");
225
226         ce = i915_gem_context_get_engine(ctx[1], RCS0);
227         GEM_BUG_ON(IS_ERR(ce));
228         vip = mock_request(ce, 0);
229         intel_context_put(ce);
230         if (!vip) {
231                 err = -ENOMEM;
232                 goto err_context_1;
233         }
234
235         /* Simulate preemption by manual reordering */
236         if (!mock_cancel_request(request)) {
237                 pr_err("failed to cancel request (already executed)!\n");
238                 i915_request_add(vip);
239                 goto err_context_1;
240         }
241         i915_request_get(vip);
242         i915_request_add(vip);
243         rcu_read_lock();
244         request->engine->submit_request(request);
245         rcu_read_unlock();
246
247
248         if (i915_request_wait(vip, 0, HZ) == -ETIME) {
249                 pr_err("timed out waiting for high priority request\n");
250                 goto err;
251         }
252
253         if (i915_request_completed(request)) {
254                 pr_err("low priority request already completed\n");
255                 goto err;
256         }
257
258         err = 0;
259 err:
260         i915_request_put(vip);
261 err_context_1:
262         mock_context_close(ctx[1]);
263         i915_request_put(request);
264 err_context_0:
265         mock_context_close(ctx[0]);
266         mock_device_flush(i915);
267         return err;
268 }
269
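/*
 * Per-worker parameters for the breadcrumb smoketests: the engine under
 * test, a pool of contexts to allocate requests from, limits on the batch
 * size, and counters accumulated by each kthread.
 */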
270 struct smoketest {
271         struct intel_engine_cs *engine;
272         struct i915_gem_context **contexts;
273         atomic_long_t num_waits, num_fences;
274         int ncontexts, max_batch;
275         struct i915_request *(*request_alloc)(struct intel_context *ce);
276 };
277
278 static struct i915_request *
279 __mock_request_alloc(struct intel_context *ce)
280 {
281         return mock_request(ce, 0);
282 }
283
284 static struct i915_request *
285 __live_request_alloc(struct intel_context *ce)
286 {
287         return intel_context_create_request(ce);
288 }
289
290 static int __igt_breadcrumbs_smoketest(void *arg)
291 {
292         struct smoketest *t = arg;
293         const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294         const unsigned int total = 4 * t->ncontexts + 1;
295         unsigned int num_waits = 0, num_fences = 0;
296         struct i915_request **requests;
297         I915_RND_STATE(prng);
298         unsigned int *order;
299         int err = 0;
300
301         /*
302          * A very simple test to catch the most egregious of list handling bugs.
303          *
304          * At its heart, we simply create oodles of requests running across
305          * multiple kthreads and enable signaling on them, for the sole purpose
306          * of stressing our breadcrumb handling. The only check we make is
307          * that the fences were marked as signaled.
308          */
309
310         requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
311         if (!requests)
312                 return -ENOMEM;
313
314         order = i915_random_order(total, &prng);
315         if (!order) {
316                 err = -ENOMEM;
317                 goto out_requests;
318         }
319
320         while (!kthread_should_stop()) {
321                 struct i915_sw_fence *submit, *wait;
322                 unsigned int n, count;
323
324                 submit = heap_fence_create(GFP_KERNEL);
325                 if (!submit) {
326                         err = -ENOMEM;
327                         break;
328                 }
329
330                 wait = heap_fence_create(GFP_KERNEL);
331                 if (!wait) {
332                         i915_sw_fence_commit(submit);
333                         heap_fence_put(submit);
334                         err = -ENOMEM;
335                         break;
336                 }
337
338                 i915_random_reorder(order, total, &prng);
339                 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
340
341                 for (n = 0; n < count; n++) {
342                         struct i915_gem_context *ctx =
343                                 t->contexts[order[n] % t->ncontexts];
344                         struct i915_request *rq;
345                         struct intel_context *ce;
346
347                         ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348                         GEM_BUG_ON(IS_ERR(ce));
349                         rq = t->request_alloc(ce);
350                         intel_context_put(ce);
351                         if (IS_ERR(rq)) {
352                                 err = PTR_ERR(rq);
353                                 count = n;
354                                 break;
355                         }
356
357                         err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
358                                                                submit,
359                                                                GFP_KERNEL);
360
361                         requests[n] = i915_request_get(rq);
362                         i915_request_add(rq);
363
364                         if (err >= 0)
365                                 err = i915_sw_fence_await_dma_fence(wait,
366                                                                     &rq->fence,
367                                                                     0,
368                                                                     GFP_KERNEL);
369
370                         if (err < 0) {
371                                 i915_request_put(rq);
372                                 count = n;
373                                 break;
374                         }
375                 }
376
377                 i915_sw_fence_commit(submit);
378                 i915_sw_fence_commit(wait);
379
380                 if (!wait_event_timeout(wait->wait,
381                                         i915_sw_fence_done(wait),
382                                         5 * HZ)) {
383                         struct i915_request *rq = requests[count - 1];
384
385                         pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386                                atomic_read(&wait->pending), count,
387                                rq->fence.context, rq->fence.seqno,
388                                t->engine->name);
389                         GEM_TRACE_DUMP();
390
391                         intel_gt_set_wedged(t->engine->gt);
392                         GEM_BUG_ON(!i915_request_completed(rq));
393                         i915_sw_fence_wait(wait);
394                         err = -EIO;
395                 }
396
397                 for (n = 0; n < count; n++) {
398                         struct i915_request *rq = requests[n];
399
400                         if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
401                                       &rq->fence.flags)) {
402                                 pr_err("%llu:%llu was not signaled!\n",
403                                        rq->fence.context, rq->fence.seqno);
404                                 err = -EINVAL;
405                         }
406
407                         i915_request_put(rq);
408                 }
409
410                 heap_fence_put(wait);
411                 heap_fence_put(submit);
412
413                 if (err < 0)
414                         break;
415
416                 num_fences += count;
417                 num_waits++;
418
419                 cond_resched();
420         }
421
422         atomic_long_add(num_fences, &t->num_fences);
423         atomic_long_add(num_waits, &t->num_waits);
424
425         kfree(order);
426 out_requests:
427         kfree(requests);
428         return err;
429 }
430
431 static int mock_breadcrumbs_smoketest(void *arg)
432 {
433         struct drm_i915_private *i915 = arg;
434         struct smoketest t = {
435                 .engine = rcs0(i915),
436                 .ncontexts = 1024,
437                 .max_batch = 1024,
438                 .request_alloc = __mock_request_alloc
439         };
440         unsigned int ncpus = num_online_cpus();
441         struct task_struct **threads;
442         unsigned int n;
443         int ret = 0;
444
445         /*
446          * Smoketest our breadcrumb/signal handling for requests across multiple
447          * threads. A very simple test to only catch the most egregious of bugs.
448          * See __igt_breadcrumbs_smoketest();
449          */
450
451         threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
452         if (!threads)
453                 return -ENOMEM;
454
455         t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
456         if (!t.contexts) {
457                 ret = -ENOMEM;
458                 goto out_threads;
459         }
460
461         for (n = 0; n < t.ncontexts; n++) {
462                 t.contexts[n] = mock_context(t.engine->i915, "mock");
463                 if (!t.contexts[n]) {
464                         ret = -ENOMEM;
465                         goto out_contexts;
466                 }
467         }
468
469         for (n = 0; n < ncpus; n++) {
470                 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
471                                          &t, "igt/%d", n);
472                 if (IS_ERR(threads[n])) {
473                         ret = PTR_ERR(threads[n]);
474                         ncpus = n;
475                         break;
476                 }
477
478                 get_task_struct(threads[n]);
479         }
480
481         yield(); /* start all threads before we begin */
482         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
483
484         for (n = 0; n < ncpus; n++) {
485                 int err;
486
487                 err = kthread_stop(threads[n]);
488                 if (err < 0 && !ret)
489                         ret = err;
490
491                 put_task_struct(threads[n]);
492         }
493         pr_info("Completed %lu waits for %lu fences across %d cpus\n",
494                 atomic_long_read(&t.num_waits),
495                 atomic_long_read(&t.num_fences),
496                 ncpus);
497
498 out_contexts:
499         for (n = 0; n < t.ncontexts; n++) {
500                 if (!t.contexts[n])
501                         break;
502                 mock_context_close(t.contexts[n]);
503         }
504         kfree(t.contexts);
505 out_threads:
506         kfree(threads);
507         return ret;
508 }
509
510 int i915_request_mock_selftests(void)
511 {
512         static const struct i915_subtest tests[] = {
513                 SUBTEST(igt_add_request),
514                 SUBTEST(igt_wait_request),
515                 SUBTEST(igt_fence_wait),
516                 SUBTEST(igt_request_rewind),
517                 SUBTEST(mock_breadcrumbs_smoketest),
518         };
519         struct drm_i915_private *i915;
520         intel_wakeref_t wakeref;
521         int err = 0;
522
523         i915 = mock_gem_device();
524         if (!i915)
525                 return -ENOMEM;
526
527         with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528                 err = i915_subtests(tests, i915);
529
530         mock_destroy_device(i915);
531
532         return err;
533 }
534
535 static int live_nop_request(void *arg)
536 {
537         struct drm_i915_private *i915 = arg;
538         struct intel_engine_cs *engine;
539         struct igt_live_test t;
540         int err = -ENODEV;
541
542         /*
543          * Submit various sized batches of empty requests, to each engine
544          * (individually), and wait for the batch to complete. We can check
545          * the overhead of submitting requests to the hardware.
546          */
547
548         for_each_uabi_engine(engine, i915) {
549                 unsigned long n, prime;
550                 IGT_TIMEOUT(end_time);
551                 ktime_t times[2] = {};
552
553                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
554                 if (err)
555                         return err;
556
557                 intel_engine_pm_get(engine);
558                 for_each_prime_number_from(prime, 1, 8192) {
559                         struct i915_request *request = NULL;
560
561                         times[1] = ktime_get_raw();
562
563                         for (n = 0; n < prime; n++) {
564                                 i915_request_put(request);
565                                 request = i915_request_create(engine->kernel_context);
566                                 if (IS_ERR(request))
567                                         return PTR_ERR(request);
568
569                                 /*
570                                  * This space is left intentionally blank.
571                                  *
572                                  * We do not actually want to perform any
573                                  * action with this request, we just want
574                                  * to measure the latency in allocation
575                                  * and submission of our breadcrumbs -
576                                  * ensuring that the bare request is sufficient
577                                  * for the system to work (i.e. proper HEAD
578                                  * tracking of the rings, interrupt handling,
579                                  * etc). It also gives us the lowest bounds
580                                  * for latency.
581                                  */
582
583                                 i915_request_get(request);
584                                 i915_request_add(request);
585                         }
586                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587                         i915_request_put(request);
588
589                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
590                         if (prime == 1)
591                                 times[0] = times[1];
592
593                         if (__igt_timeout(end_time, NULL))
594                                 break;
595                 }
596                 intel_engine_pm_put(engine);
597
598                 err = igt_live_test_end(&t);
599                 if (err)
600                         return err;
601
602                 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
603                         engine->name,
604                         ktime_to_ns(times[0]),
605                         prime, div64_u64(ktime_to_ns(times[1]), prime));
606         }
607
608         return err;
609 }
610
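/*
 * Build a single-page batch containing just MI_BATCH_BUFFER_END, pinned
 * in the GGTT, so requests carry the minimum possible payload.
 */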
611 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
612 {
613         struct drm_i915_gem_object *obj;
614         struct i915_vma *vma;
615         u32 *cmd;
616         int err;
617
618         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
619         if (IS_ERR(obj))
620                 return ERR_CAST(obj);
621
622         cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
623         if (IS_ERR(cmd)) {
624                 err = PTR_ERR(cmd);
625                 goto err;
626         }
627
628         *cmd = MI_BATCH_BUFFER_END;
629
630         __i915_gem_object_flush_map(obj, 0, 64);
631         i915_gem_object_unpin_map(obj);
632
633         intel_gt_chipset_flush(&i915->gt);
634
635         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
636         if (IS_ERR(vma)) {
637                 err = PTR_ERR(vma);
638                 goto err;
639         }
640
641         err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
642         if (err)
643                 goto err;
644
645         /* Force the wait now to avoid including it in the benchmark */
646         err = i915_vma_sync(vma);
647         if (err)
648                 goto err_pin;
649
650         return vma;
651
652 err_pin:
653         i915_vma_unpin(vma);
654 err:
655         i915_gem_object_put(obj);
656         return ERR_PTR(err);
657 }
658
659 static struct i915_request *
660 empty_request(struct intel_engine_cs *engine,
661               struct i915_vma *batch)
662 {
663         struct i915_request *request;
664         int err;
665
666         request = i915_request_create(engine->kernel_context);
667         if (IS_ERR(request))
668                 return request;
669
670         err = engine->emit_bb_start(request,
671                                     batch->node.start,
672                                     batch->node.size,
673                                     I915_DISPATCH_SECURE);
674         if (err)
675                 goto out_request;
676
677         i915_request_get(request);
678 out_request:
679         i915_request_add(request);
680         return err ? ERR_PTR(err) : request;
681 }
682
683 static int live_empty_request(void *arg)
684 {
685         struct drm_i915_private *i915 = arg;
686         struct intel_engine_cs *engine;
687         struct igt_live_test t;
688         struct i915_vma *batch;
689         int err = 0;
690
691         /*
692          * Submit various sized batches of empty requests, to each engine
693          * (individually), and wait for the batch to complete. We can check
694          * the overhead of submitting requests to the hardware.
695          */
696
697         batch = empty_batch(i915);
698         if (IS_ERR(batch))
699                 return PTR_ERR(batch);
700
701         for_each_uabi_engine(engine, i915) {
702                 IGT_TIMEOUT(end_time);
703                 struct i915_request *request;
704                 unsigned long n, prime;
705                 ktime_t times[2] = {};
706
707                 err = igt_live_test_begin(&t, i915, __func__, engine->name);
708                 if (err)
709                         goto out_batch;
710
711                 intel_engine_pm_get(engine);
712
713                 /* Warmup / preload */
714                 request = empty_request(engine, batch);
715                 if (IS_ERR(request)) {
716                         err = PTR_ERR(request);
717                         intel_engine_pm_put(engine);
718                         goto out_batch;
719                 }
720                 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
721
722                 for_each_prime_number_from(prime, 1, 8192) {
723                         times[1] = ktime_get_raw();
724
725                         for (n = 0; n < prime; n++) {
726                                 i915_request_put(request);
727                                 request = empty_request(engine, batch);
728                                 if (IS_ERR(request)) {
729                                         err = PTR_ERR(request);
730                                         intel_engine_pm_put(engine);
731                                         goto out_batch;
732                                 }
733                         }
734                         i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
735
736                         times[1] = ktime_sub(ktime_get_raw(), times[1]);
737                         if (prime == 1)
738                                 times[0] = times[1];
739
740                         if (__igt_timeout(end_time, NULL))
741                                 break;
742                 }
743                 i915_request_put(request);
744                 intel_engine_pm_put(engine);
745
746                 err = igt_live_test_end(&t);
747                 if (err)
748                         goto out_batch;
749
750                 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
751                         engine->name,
752                         ktime_to_ns(times[0]),
753                         prime, div64_u64(ktime_to_ns(times[1]), prime));
754         }
755
756 out_batch:
757         i915_vma_unpin(batch);
758         i915_vma_put(batch);
759         return err;
760 }
761
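/*
 * Build a batch whose first instruction jumps back to its own start, so it
 * spins on the GPU until recursive_batch_resolve() rewrites that first
 * dword to MI_BATCH_BUFFER_END.
 */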
762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
763 {
764         struct drm_i915_gem_object *obj;
765         const int gen = INTEL_GEN(i915);
766         struct i915_vma *vma;
767         u32 *cmd;
768         int err;
769
770         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
771         if (IS_ERR(obj))
772                 return ERR_CAST(obj);
773
774         vma = i915_vma_instance(obj, i915->gt.vm, NULL);
775         if (IS_ERR(vma)) {
776                 err = PTR_ERR(vma);
777                 goto err;
778         }
779
780         err = i915_vma_pin(vma, 0, 0, PIN_USER);
781         if (err)
782                 goto err;
783
784         cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
785         if (IS_ERR(cmd)) {
786                 err = PTR_ERR(cmd);
787                 goto err;
788         }
789
790         if (gen >= 8) {
791                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
792                 *cmd++ = lower_32_bits(vma->node.start);
793                 *cmd++ = upper_32_bits(vma->node.start);
794         } else if (gen >= 6) {
795                 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
796                 *cmd++ = lower_32_bits(vma->node.start);
797         } else {
798                 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
799                 *cmd++ = lower_32_bits(vma->node.start);
800         }
801         *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
802
803         __i915_gem_object_flush_map(obj, 0, 64);
804         i915_gem_object_unpin_map(obj);
805
806         intel_gt_chipset_flush(&i915->gt);
807
808         return vma;
809
810 err:
811         i915_gem_object_put(obj);
812         return ERR_PTR(err);
813 }
814
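/* Terminate a recursive_batch() by overwriting its first dword. */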
815 static int recursive_batch_resolve(struct i915_vma *batch)
816 {
817         u32 *cmd;
818
819         cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
820         if (IS_ERR(cmd))
821                 return PTR_ERR(cmd);
822
823         *cmd = MI_BATCH_BUFFER_END;
824
825         __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
826         i915_gem_object_unpin_map(batch->obj);
827
828         intel_gt_chipset_flush(batch->vm->gt);
829
830         return 0;
831 }
832
833 static int live_all_engines(void *arg)
834 {
835         struct drm_i915_private *i915 = arg;
836         const unsigned int nengines = num_uabi_engines(i915);
837         struct intel_engine_cs *engine;
838         struct i915_request **request;
839         struct igt_live_test t;
840         struct i915_vma *batch;
841         unsigned int idx;
842         int err;
843
844         /*
845          * Check we can submit requests to all engines simultaneously. We
846          * send a recursive batch to each engine - checking that we don't
847          * block doing so, and that they don't complete too soon.
848          */
849
850         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
851         if (!request)
852                 return -ENOMEM;
853
854         err = igt_live_test_begin(&t, i915, __func__, "");
855         if (err)
856                 goto out_free;
857
858         batch = recursive_batch(i915);
859         if (IS_ERR(batch)) {
860                 err = PTR_ERR(batch);
861                 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
862                 goto out_free;
863         }
864
865         i915_vma_lock(batch);
866
867         idx = 0;
868         for_each_uabi_engine(engine, i915) {
869                 request[idx] = intel_engine_create_kernel_request(engine);
870                 if (IS_ERR(request[idx])) {
871                         err = PTR_ERR(request[idx]);
872                         pr_err("%s: Request allocation failed with err=%d\n",
873                                __func__, err);
874                         goto out_request;
875                 }
876
877                 err = i915_request_await_object(request[idx], batch->obj, 0);
878                 if (err == 0)
879                         err = i915_vma_move_to_active(batch, request[idx], 0);
880                 GEM_BUG_ON(err);
881
882                 err = engine->emit_bb_start(request[idx],
883                                             batch->node.start,
884                                             batch->node.size,
885                                             0);
886                 GEM_BUG_ON(err);
887                 request[idx]->batch = batch;
888
889                 i915_request_get(request[idx]);
890                 i915_request_add(request[idx]);
891                 idx++;
892         }
893
894         i915_vma_unlock(batch);
895
896         idx = 0;
897         for_each_uabi_engine(engine, i915) {
898                 if (i915_request_completed(request[idx])) {
899                         pr_err("%s(%s): request completed too early!\n",
900                                __func__, engine->name);
901                         err = -EINVAL;
902                         goto out_request;
903                 }
904                 idx++;
905         }
906
907         err = recursive_batch_resolve(batch);
908         if (err) {
909                 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
910                 goto out_request;
911         }
912
913         idx = 0;
914         for_each_uabi_engine(engine, i915) {
915                 long timeout;
916
917                 timeout = i915_request_wait(request[idx], 0,
918                                             MAX_SCHEDULE_TIMEOUT);
919                 if (timeout < 0) {
920                         err = timeout;
921                         pr_err("%s: error waiting for request on %s, err=%d\n",
922                                __func__, engine->name, err);
923                         goto out_request;
924                 }
925
926                 GEM_BUG_ON(!i915_request_completed(request[idx]));
927                 i915_request_put(request[idx]);
928                 request[idx] = NULL;
929                 idx++;
930         }
931
932         err = igt_live_test_end(&t);
933
934 out_request:
935         idx = 0;
936         for_each_uabi_engine(engine, i915) {
937                 if (request[idx])
938                         i915_request_put(request[idx]);
939                 idx++;
940         }
941         i915_vma_unpin(batch);
942         i915_vma_put(batch);
943 out_free:
944         kfree(request);
945         return err;
946 }
947
948 static int live_sequential_engines(void *arg)
949 {
950         struct drm_i915_private *i915 = arg;
951         const unsigned int nengines = num_uabi_engines(i915);
952         struct i915_request **request;
953         struct i915_request *prev = NULL;
954         struct intel_engine_cs *engine;
955         struct igt_live_test t;
956         unsigned int idx;
957         int err;
958
959         /*
960          * Check we can submit requests to all engines sequentially, such
961          * that each successive request waits for the earlier ones. This
962          * tests that we don't execute requests out of order, even though
963          * they are running on independent engines.
964          */
965
966         request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
967         if (!request)
968                 return -ENOMEM;
969
970         err = igt_live_test_begin(&t, i915, __func__, "");
971         if (err)
972                 goto out_free;
973
974         idx = 0;
975         for_each_uabi_engine(engine, i915) {
976                 struct i915_vma *batch;
977
978                 batch = recursive_batch(i915);
979                 if (IS_ERR(batch)) {
980                         err = PTR_ERR(batch);
981                         pr_err("%s: Unable to create batch for %s, err=%d\n",
982                                __func__, engine->name, err);
983                         goto out_free;
984                 }
985
986                 i915_vma_lock(batch);
987                 request[idx] = intel_engine_create_kernel_request(engine);
988                 if (IS_ERR(request[idx])) {
989                         err = PTR_ERR(request[idx]);
990                         pr_err("%s: Request allocation failed for %s with err=%d\n",
991                                __func__, engine->name, err);
992                         goto out_unlock;
993                 }
994
995                 if (prev) {
996                         err = i915_request_await_dma_fence(request[idx],
997                                                            &prev->fence);
998                         if (err) {
999                                 i915_request_add(request[idx]);
1000                                 pr_err("%s: Request await failed for %s with err=%d\n",
1001                                        __func__, engine->name, err);
1002                                 goto out_unlock;
1003                         }
1004                 }
1005
1006                 err = i915_request_await_object(request[idx],
1007                                                 batch->obj, false);
1008                 if (err == 0)
1009                         err = i915_vma_move_to_active(batch, request[idx], 0);
1010                 GEM_BUG_ON(err);
1011
1012                 err = engine->emit_bb_start(request[idx],
1013                                             batch->node.start,
1014                                             batch->node.size,
1015                                             0);
1016                 GEM_BUG_ON(err);
1017                 request[idx]->batch = batch;
1018
1019                 i915_request_get(request[idx]);
1020                 i915_request_add(request[idx]);
1021
1022                 prev = request[idx];
1023                 idx++;
1024
1025 out_unlock:
1026                 i915_vma_unlock(batch);
1027                 if (err)
1028                         goto out_request;
1029         }
1030
1031         idx = 0;
1032         for_each_uabi_engine(engine, i915) {
1033                 long timeout;
1034
1035                 if (i915_request_completed(request[idx])) {
1036                         pr_err("%s(%s): request completed too early!\n",
1037                                __func__, engine->name);
1038                         err = -EINVAL;
1039                         goto out_request;
1040                 }
1041
1042                 err = recursive_batch_resolve(request[idx]->batch);
1043                 if (err) {
1044                         pr_err("%s: failed to resolve batch, err=%d\n",
1045                                __func__, err);
1046                         goto out_request;
1047                 }
1048
1049                 timeout = i915_request_wait(request[idx], 0,
1050                                             MAX_SCHEDULE_TIMEOUT);
1051                 if (timeout < 0) {
1052                         err = timeout;
1053                         pr_err("%s: error waiting for request on %s, err=%d\n",
1054                                __func__, engine->name, err);
1055                         goto out_request;
1056                 }
1057
1058                 GEM_BUG_ON(!i915_request_completed(request[idx]));
1059                 idx++;
1060         }
1061
1062         err = igt_live_test_end(&t);
1063
1064 out_request:
1065         idx = 0;
1066         for_each_uabi_engine(engine, i915) {
1067                 u32 *cmd;
1068
1069                 if (!request[idx])
1070                         break;
1071
1072                 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1073                                               I915_MAP_WC);
1074                 if (!IS_ERR(cmd)) {
1075                         *cmd = MI_BATCH_BUFFER_END;
1076
1077                         __i915_gem_object_flush_map(request[idx]->batch->obj,
1078                                                     0, sizeof(*cmd));
1079                         i915_gem_object_unpin_map(request[idx]->batch->obj);
1080
1081                         intel_gt_chipset_flush(engine->gt);
1082                 }
1083
1084                 i915_vma_put(request[idx]->batch);
1085                 i915_request_put(request[idx]);
1086                 idx++;
1087         }
1088 out_free:
1089         kfree(request);
1090         return err;
1091 }
1092
1093 static int __live_parallel_engine1(void *arg)
1094 {
1095         struct intel_engine_cs *engine = arg;
1096         IGT_TIMEOUT(end_time);
1097         unsigned long count;
1098         int err = 0;
1099
1100         count = 0;
1101         intel_engine_pm_get(engine);
1102         do {
1103                 struct i915_request *rq;
1104
1105                 rq = i915_request_create(engine->kernel_context);
1106                 if (IS_ERR(rq)) {
1107                         err = PTR_ERR(rq);
1108                         break;
1109                 }
1110
1111                 i915_request_get(rq);
1112                 i915_request_add(rq);
1113
1114                 err = 0;
1115                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1116                         err = -ETIME;
1117                 i915_request_put(rq);
1118                 if (err)
1119                         break;
1120
1121                 count++;
1122         } while (!__igt_timeout(end_time, NULL));
1123         intel_engine_pm_put(engine);
1124
1125         pr_info("%s: %lu request + sync\n", engine->name, count);
1126         return err;
1127 }
1128
1129 static int __live_parallel_engineN(void *arg)
1130 {
1131         struct intel_engine_cs *engine = arg;
1132         IGT_TIMEOUT(end_time);
1133         unsigned long count;
1134         int err = 0;
1135
1136         count = 0;
1137         intel_engine_pm_get(engine);
1138         do {
1139                 struct i915_request *rq;
1140
1141                 rq = i915_request_create(engine->kernel_context);
1142                 if (IS_ERR(rq)) {
1143                         err = PTR_ERR(rq);
1144                         break;
1145                 }
1146
1147                 i915_request_add(rq);
1148                 count++;
1149         } while (!__igt_timeout(end_time, NULL));
1150         intel_engine_pm_put(engine);
1151
1152         pr_info("%s: %lu requests\n", engine->name, count);
1153         return err;
1154 }
1155
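/*
 * Each parallel spinner decrements i915->selftest.counter when it is up
 * and running; wait_for_all() blocks until the counter reaches zero, so
 * every engine keeps its spinner busy until all engines have started.
 */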
1156 static bool wake_all(struct drm_i915_private *i915)
1157 {
1158         if (atomic_dec_and_test(&i915->selftest.counter)) {
1159                 wake_up_var(&i915->selftest.counter);
1160                 return true;
1161         }
1162
1163         return false;
1164 }
1165
1166 static int wait_for_all(struct drm_i915_private *i915)
1167 {
1168         if (wake_all(i915))
1169                 return 0;
1170
1171         if (wait_var_event_timeout(&i915->selftest.counter,
1172                                    !atomic_read(&i915->selftest.counter),
1173                                    i915_selftest.timeout_jiffies))
1174                 return 0;
1175
1176         return -ETIME;
1177 }
1178
1179 static int __live_parallel_spin(void *arg)
1180 {
1181         struct intel_engine_cs *engine = arg;
1182         struct igt_spinner spin;
1183         struct i915_request *rq;
1184         int err = 0;
1185
1186         /*
1187          * Create a spinner running for eternity on each engine. If a second
1188          * spinner is incorrectly placed on the same engine, it will not be
1189          * able to start in time.
1190          */
1191
1192         if (igt_spinner_init(&spin, engine->gt)) {
1193                 wake_all(engine->i915);
1194                 return -ENOMEM;
1195         }
1196
1197         intel_engine_pm_get(engine);
1198         rq = igt_spinner_create_request(&spin,
1199                                         engine->kernel_context,
1200                                         MI_NOOP); /* no preemption */
1201         intel_engine_pm_put(engine);
1202         if (IS_ERR(rq)) {
1203                 err = PTR_ERR(rq);
1204                 if (err == -ENODEV)
1205                         err = 0;
1206                 wake_all(engine->i915);
1207                 goto out_spin;
1208         }
1209
1210         i915_request_get(rq);
1211         i915_request_add(rq);
1212         if (igt_wait_for_spinner(&spin, rq)) {
1213                 /* Occupy this engine for the whole test */
1214                 err = wait_for_all(engine->i915);
1215         } else {
1216                 pr_err("Failed to start spinner on %s\n", engine->name);
1217                 err = -EINVAL;
1218         }
1219         igt_spinner_end(&spin);
1220
1221         if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1222                 err = -EIO;
1223         i915_request_put(rq);
1224
1225 out_spin:
1226         igt_spinner_fini(&spin);
1227         return err;
1228 }
1229
1230 static int live_parallel_engines(void *arg)
1231 {
1232         struct drm_i915_private *i915 = arg;
1233         static int (* const func[])(void *arg) = {
1234                 __live_parallel_engine1,
1235                 __live_parallel_engineN,
1236                 __live_parallel_spin,
1237                 NULL,
1238         };
1239         const unsigned int nengines = num_uabi_engines(i915);
1240         struct intel_engine_cs *engine;
1241         int (* const *fn)(void *arg);
1242         struct task_struct **tsk;
1243         int err = 0;
1244
1245         /*
1246          * Check we can submit requests to all engines concurrently. This
1247          * tests that we load up the system maximally.
1248          */
1249
1250         tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1251         if (!tsk)
1252                 return -ENOMEM;
1253
1254         for (fn = func; !err && *fn; fn++) {
1255                 char name[KSYM_NAME_LEN];
1256                 struct igt_live_test t;
1257                 unsigned int idx;
1258
1259                 snprintf(name, sizeof(name), "%ps", *fn);
1260                 err = igt_live_test_begin(&t, i915, __func__, name);
1261                 if (err)
1262                         break;
1263
1264                 atomic_set(&i915->selftest.counter, nengines);
1265
1266                 idx = 0;
1267                 for_each_uabi_engine(engine, i915) {
1268                         tsk[idx] = kthread_run(*fn, engine,
1269                                                "igt/parallel:%s",
1270                                                engine->name);
1271                         if (IS_ERR(tsk[idx])) {
1272                                 err = PTR_ERR(tsk[idx]);
1273                                 break;
1274                         }
1275                         get_task_struct(tsk[idx++]);
1276                 }
1277
1278                 yield(); /* start all threads before we kthread_stop() */
1279
1280                 idx = 0;
1281                 for_each_uabi_engine(engine, i915) {
1282                         int status;
1283
1284                         if (IS_ERR(tsk[idx]))
1285                                 break;
1286
1287                         status = kthread_stop(tsk[idx]);
1288                         if (status && !err)
1289                                 err = status;
1290
1291                         put_task_struct(tsk[idx++]);
1292                 }
1293
1294                 if (igt_live_test_end(&t))
1295                         err = -EIO;
1296         }
1297
1298         kfree(tsk);
1299         return err;
1300 }
1301
1302 static int
1303 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1304 {
1305         struct i915_request *rq;
1306         int ret;
1307
1308         /*
1309          * Before execlists, all contexts share the same ringbuffer. With
1310          * execlists, each context/engine has a separate ringbuffer and
1311          * for the purposes of this test, inexhaustible.
1312          *
1313          * For the global ringbuffer though, we have to be very careful
1314          * that we do not wrap while preventing the execution of requests
1315          * with an unsignaled fence.
1316          */
1317         if (HAS_EXECLISTS(ctx->i915))
1318                 return INT_MAX;
1319
1320         rq = igt_request_alloc(ctx, engine);
1321         if (IS_ERR(rq)) {
1322                 ret = PTR_ERR(rq);
1323         } else {
1324                 int sz;
1325
1326                 ret = rq->ring->size - rq->reserved_space;
1327                 i915_request_add(rq);
1328
1329                 sz = rq->ring->emit - rq->head;
1330                 if (sz < 0)
1331                         sz += rq->ring->size;
1332                 ret /= sz;
1333                 ret /= 2; /* leave half spare, in case of emergency! */
1334         }
1335
1336         return ret;
1337 }
1338
1339 static int live_breadcrumbs_smoketest(void *arg)
1340 {
1341         struct drm_i915_private *i915 = arg;
1342         const unsigned int nengines = num_uabi_engines(i915);
1343         const unsigned int ncpus = num_online_cpus();
1344         unsigned long num_waits, num_fences;
1345         struct intel_engine_cs *engine;
1346         struct task_struct **threads;
1347         struct igt_live_test live;
1348         intel_wakeref_t wakeref;
1349         struct smoketest *smoke;
1350         unsigned int n, idx;
1351         struct file *file;
1352         int ret = 0;
1353
1354         /*
1355          * Smoketest our breadcrumb/signal handling for requests across multiple
1356          * threads. A very simple test to only catch the most egregious of bugs.
1357          * See __igt_breadcrumbs_smoketest();
1358          *
1359          * On real hardware this time.
1360          */
1361
1362         wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1363
1364         file = mock_file(i915);
1365         if (IS_ERR(file)) {
1366                 ret = PTR_ERR(file);
1367                 goto out_rpm;
1368         }
1369
1370         smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1371         if (!smoke) {
1372                 ret = -ENOMEM;
1373                 goto out_file;
1374         }
1375
1376         threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1377         if (!threads) {
1378                 ret = -ENOMEM;
1379                 goto out_smoke;
1380         }
1381
1382         smoke[0].request_alloc = __live_request_alloc;
1383         smoke[0].ncontexts = 64;
1384         smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1385                                     sizeof(*smoke[0].contexts),
1386                                     GFP_KERNEL);
1387         if (!smoke[0].contexts) {
1388                 ret = -ENOMEM;
1389                 goto out_threads;
1390         }
1391
1392         for (n = 0; n < smoke[0].ncontexts; n++) {
1393                 smoke[0].contexts[n] = live_context(i915, file);
1394                 if (IS_ERR(smoke[0].contexts[n])) {
1395                         ret = PTR_ERR(smoke[0].contexts[n]);
1396                         goto out_contexts;
1397                 }
1398         }
1399
1400         ret = igt_live_test_begin(&live, i915, __func__, "");
1401         if (ret)
1402                 goto out_contexts;
1403
1404         idx = 0;
1405         for_each_uabi_engine(engine, i915) {
1406                 smoke[idx] = smoke[0];
1407                 smoke[idx].engine = engine;
1408                 smoke[idx].max_batch =
1409                         max_batches(smoke[0].contexts[0], engine);
1410                 if (smoke[idx].max_batch < 0) {
1411                         ret = smoke[idx].max_batch;
1412                         goto out_flush;
1413                 }
1414                 /* One ring interleaved between requests from all cpus */
1415                 smoke[idx].max_batch /= num_online_cpus() + 1;
1416                 pr_debug("Limiting batches to %d requests on %s\n",
1417                          smoke[idx].max_batch, engine->name);
1418
1419                 for (n = 0; n < ncpus; n++) {
1420                         struct task_struct *tsk;
1421
1422                         tsk = kthread_run(__igt_breadcrumbs_smoketest,
1423                                           &smoke[idx], "igt/%d.%d", idx, n);
1424                         if (IS_ERR(tsk)) {
1425                                 ret = PTR_ERR(tsk);
1426                                 goto out_flush;
1427                         }
1428
1429                         get_task_struct(tsk);
1430                         threads[idx * ncpus + n] = tsk;
1431                 }
1432
1433                 idx++;
1434         }
1435
1436         yield(); /* start all threads before we begin */
1437         msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1438
1439 out_flush:
1440         idx = 0;
1441         num_waits = 0;
1442         num_fences = 0;
1443         for_each_uabi_engine(engine, i915) {
1444                 for (n = 0; n < ncpus; n++) {
1445                         struct task_struct *tsk = threads[idx * ncpus + n];
1446                         int err;
1447
1448                         if (!tsk)
1449                                 continue;
1450
1451                         err = kthread_stop(tsk);
1452                         if (err < 0 && !ret)
1453                                 ret = err;
1454
1455                         put_task_struct(tsk);
1456                 }
1457
1458                 num_waits += atomic_long_read(&smoke[idx].num_waits);
1459                 num_fences += atomic_long_read(&smoke[idx].num_fences);
1460                 idx++;
1461         }
1462         pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1463                 num_waits, num_fences, idx, ncpus);
1464
1465         ret = igt_live_test_end(&live) ?: ret;
1466 out_contexts:
1467         kfree(smoke[0].contexts);
1468 out_threads:
1469         kfree(threads);
1470 out_smoke:
1471         kfree(smoke);
1472 out_file:
1473         fput(file);
1474 out_rpm:
1475         intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1476
1477         return ret;
1478 }
1479
1480 int i915_request_live_selftests(struct drm_i915_private *i915)
1481 {
1482         static const struct i915_subtest tests[] = {
1483                 SUBTEST(live_nop_request),
1484                 SUBTEST(live_all_engines),
1485                 SUBTEST(live_sequential_engines),
1486                 SUBTEST(live_parallel_engines),
1487                 SUBTEST(live_empty_request),
1488                 SUBTEST(live_breadcrumbs_smoketest),
1489         };
1490
1491         if (intel_gt_is_wedged(&i915->gt))
1492                 return 0;
1493
1494         return i915_subtests(tests, i915);
1495 }
1496
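/*
 * Queue a kernel request behind the context's last request and wait for
 * the engine to idle, propagating -ETIME on timeout unless err is already
 * set.
 */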
1497 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1498 {
1499         struct i915_request *rq;
1500         struct dma_fence *fence;
1501
1502         rq = intel_engine_create_kernel_request(ce->engine);
1503         if (IS_ERR(rq))
1504                 return PTR_ERR(rq);
1505
1506         fence = i915_active_fence_get(&ce->timeline->last_request);
1507         if (fence) {
1508                 i915_request_await_dma_fence(rq, fence);
1509                 dma_fence_put(fence);
1510         }
1511
1512         rq = i915_request_get(rq);
1513         i915_request_add(rq);
1514         if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1515                 err = -ETIME;
1516         i915_request_put(rq);
1517
1518         while (!err && !intel_engine_is_idle(ce->engine))
1519                 intel_engine_flush_submission(ce->engine);
1520
1521         return err;
1522 }
1523
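/*
 * Per-engine sample accumulators (perf_stats) and the set of contexts
 * spanning all engines (perf_series) used by the latency measurements
 * that follow.
 */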
1524 struct perf_stats {
1525         struct intel_engine_cs *engine;
1526         unsigned long count;
1527         ktime_t time;
1528         ktime_t busy;
1529         u64 runtime;
1530 };
1531
1532 struct perf_series {
1533         struct drm_i915_private *i915;
1534         unsigned int nengines;
1535         struct intel_context *ce[];
1536 };
1537
1538 static int cmp_u32(const void *A, const void *B)
1539 {
1540         const u32 *a = A, *b = B;
1541
1542         return *a - *b;
1543 }
1544
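/*
 * Filter TF_COUNT samples by sorting them and summing the middle three,
 * counting the median twice; the result is 4x the filtered value, and the
 * callers undo that scaling by shifting right by TF_BIAS (cycles_to_ns()
 * divides by 1 << TF_BIAS). e.g. {3, 1, 4, 1, 5} sorts to {1, 1, 3, 4, 5},
 * giving sum = 1 + 2*3 + 4 = 11, reported as 11 >> TF_BIAS = 2 cycles.
 */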
1545 static u32 trifilter(u32 *a)
1546 {
1547         u64 sum;
1548
1549 #define TF_COUNT 5
1550         sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1551
1552         sum = mul_u32_u32(a[2], 2);
1553         sum += a[1];
1554         sum += a[3];
1555
1556         GEM_BUG_ON(sum > U32_MAX);
1557         return sum;
1558 #define TF_BIAS 2
1559 }
1560
1561 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1562 {
1563         u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1564
1565         return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1566 }
1567
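/* Store the engine's RING_TIMESTAMP register into a dword at the given GGTT offset. */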
1568 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1569 {
1570         *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1571         *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1572         *cs++ = offset;
1573         *cs++ = 0;
1574
1575         return cs;
1576 }
1577
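/* Store an immediate dword value at the given GGTT offset. */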
1578 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1579 {
1580         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1581         *cs++ = offset;
1582         *cs++ = 0;
1583         *cs++ = value;
1584
1585         return cs;
1586 }
1587
1588 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1589 {
1590         *cs++ = MI_SEMAPHORE_WAIT |
1591                 MI_SEMAPHORE_GLOBAL_GTT |
1592                 MI_SEMAPHORE_POLL |
1593                 mode;
1594         *cs++ = value;
1595         *cs++ = offset;
1596         *cs++ = 0;
1597
1598         return cs;
1599 }
1600
1601 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1602 {
1603         return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1604 }
1605
1606 static void semaphore_set(u32 *sema, u32 value)
1607 {
1608         WRITE_ONCE(*sema, value);
1609         wmb(); /* flush the update to the cache, and beyond */
1610 }
1611
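/*
 * Carve out a small scratch area within the engine's status page (HWSP)
 * and zero it; hwsp_offset() converts a pointer into that page into the
 * GGTT address for the command streamer to write to or poll.
 */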
1612 static u32 *hwsp_scratch(const struct intel_context *ce)
1613 {
1614         return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1615 }
1616
1617 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1618 {
1619         return (i915_ggtt_offset(ce->engine->status_page.vma) +
1620                 offset_in_page(dw));
1621 }
1622
1623 static int measure_semaphore_response(struct intel_context *ce)
1624 {
1625         u32 *sema = hwsp_scratch(ce);
1626         const u32 offset = hwsp_offset(ce, sema);
1627         u32 elapsed[TF_COUNT], cycles;
1628         struct i915_request *rq;
1629         u32 *cs;
1630         int err;
1631         int i;
1632
1633         /*
1634          * Measure how many cycles it takes for the HW to detect the change
1635          * in a semaphore value.
1636          *
1637          *    A: read CS_TIMESTAMP from CPU
1638          *    poke semaphore
1639          *    B: read CS_TIMESTAMP on GPU
1640          *
1641          * Semaphore latency: B - A
1642          */
1643
1644         semaphore_set(sema, -1);
1645
1646         rq = i915_request_create(ce);
1647         if (IS_ERR(rq))
1648                 return PTR_ERR(rq);
1649
1650         cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1651         if (IS_ERR(cs)) {
1652                 i915_request_add(rq);
1653                 err = PTR_ERR(cs);
1654                 goto err;
1655         }
1656
1657         cs = emit_store_dw(cs, offset, 0);
1658         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1659                 cs = emit_semaphore_poll_until(cs, offset, i);
1660                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1661                 cs = emit_store_dw(cs, offset, 0);
1662         }
1663
1664         intel_ring_advance(rq, cs);
1665         i915_request_add(rq);
1666
1667         if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1668                 err = -EIO;
1669                 goto err;
1670         }
1671
1672         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1673                 preempt_disable();
1674                 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1675                 semaphore_set(sema, i);
1676                 preempt_enable();
1677
1678                 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1679                         err = -EIO;
1680                         goto err;
1681                 }
1682
1683                 elapsed[i - 1] = sema[i] - cycles;
1684         }
1685
1686         cycles = trifilter(elapsed);
1687         pr_info("%s: semaphore response %d cycles, %lluns\n",
1688                 ce->engine->name, cycles >> TF_BIAS,
1689                 cycles_to_ns(ce->engine, cycles));
1690
1691         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1692
1693 err:
1694         intel_gt_set_wedged(ce->engine->gt);
1695         return err;
1696 }
1697
1698 static int measure_idle_dispatch(struct intel_context *ce)
1699 {
1700         u32 *sema = hwsp_scratch(ce);
1701         const u32 offset = hwsp_offset(ce, sema);
1702         u32 elapsed[TF_COUNT], cycles;
1703         u32 *cs;
1704         int err;
1705         int i;
1706
1707         /*
1708          * Measure how long it takes for us to submit a request while the
1709          * engine is idle, but is resting in our context.
1710          *
1711          *    A: read CS_TIMESTAMP from CPU
1712          *    submit request
1713          *    B: read CS_TIMESTAMP on GPU
1714          *
1715          * Submission latency: B - A
1716          */
1717
1718         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1719                 struct i915_request *rq;
1720
1721                 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1722                 if (err)
1723                         return err;
1724
1725                 rq = i915_request_create(ce);
1726                 if (IS_ERR(rq)) {
1727                         err = PTR_ERR(rq);
1728                         goto err;
1729                 }
1730
1731                 cs = intel_ring_begin(rq, 4);
1732                 if (IS_ERR(cs)) {
1733                         i915_request_add(rq);
1734                         err = PTR_ERR(cs);
1735                         goto err;
1736                 }
1737
1738                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1739
1740                 intel_ring_advance(rq, cs);
1741
1742                 preempt_disable();
1743                 local_bh_disable();
1744                 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1745                 i915_request_add(rq);
1746                 local_bh_enable();
1747                 preempt_enable();
1748         }
1749
1750         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1751         if (err)
1752                 goto err;
1753
1754         for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1755                 elapsed[i] = sema[i] - elapsed[i];
1756
1757         cycles = trifilter(elapsed);
1758         pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1759                 ce->engine->name, cycles >> TF_BIAS,
1760                 cycles_to_ns(ce->engine, cycles));
1761
1762         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1763
1764 err:
1765         intel_gt_set_wedged(ce->engine->gt);
1766         return err;
1767 }
1768
1769 static int measure_busy_dispatch(struct intel_context *ce)
1770 {
1771         u32 *sema = hwsp_scratch(ce);
1772         const u32 offset = hwsp_offset(ce, sema);
1773         u32 elapsed[TF_COUNT + 1], cycles;
1774         u32 *cs;
1775         int err;
1776         int i;
1777
1778         /*
1779          * Measure how long it takes for us to submit a request while the
1780          * engine is busy, polling on a semaphore in our context. With
1781          * direct submission, this will include the cost of a lite restore.
1782          *
1783          *    A: read CS_TIMESTAMP from CPU
1784          *    submit request
1785          *    B: read CS_TIMESTAMP on GPU
1786          *
1787          * Submission latency: B - A
1788          */
1789
1790         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1791                 struct i915_request *rq;
1792
1793                 rq = i915_request_create(ce);
1794                 if (IS_ERR(rq)) {
1795                         err = PTR_ERR(rq);
1796                         goto err;
1797                 }
1798
1799                 cs = intel_ring_begin(rq, 12);
1800                 if (IS_ERR(cs)) {
1801                         i915_request_add(rq);
1802                         err = PTR_ERR(cs);
1803                         goto err;
1804                 }
1805
1806                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1807                 cs = emit_semaphore_poll_until(cs, offset, i);
1808                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1809
1810                 intel_ring_advance(rq, cs);
1811
1812                 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1813                         err = -EIO;
1814                         goto err;
1815                 }
1816
1817                 preempt_disable();
1818                 local_bh_disable();
1819                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1820                 i915_request_add(rq);
1821                 local_bh_enable();
1822                 semaphore_set(sema, i - 1);
1823                 preempt_enable();
1824         }
1825
1826         wait_for(READ_ONCE(sema[i - 1]), 500);
1827         semaphore_set(sema, i - 1);
1828
1829         for (i = 1; i <= TF_COUNT; i++) {
1830                 GEM_BUG_ON(sema[i] == -1);
1831                 elapsed[i - 1] = sema[i] - elapsed[i];
1832         }
1833
1834         cycles = trifilter(elapsed);
1835         pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1836                 ce->engine->name, cycles >> TF_BIAS,
1837                 cycles_to_ns(ce->engine, cycles));
1838
1839         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1840
1841 err:
1842         intel_gt_set_wedged(ce->engine->gt);
1843         return err;
1844 }
1845
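/*
 * Block the engine with a kernel-context request that spins on an
 * MI_SEMAPHORE_WAIT until the semaphore in the status page satisfies
 * the given condition; it is released from the CPU via semaphore_set().
 */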
1846 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1847 {
1848         const u32 offset =
1849                 i915_ggtt_offset(engine->status_page.vma) +
1850                 offset_in_page(sema);
1851         struct i915_request *rq;
1852         u32 *cs;
1853
1854         rq = i915_request_create(engine->kernel_context);
1855         if (IS_ERR(rq))
1856                 return PTR_ERR(rq);
1857
1858         cs = intel_ring_begin(rq, 4);
1859         if (IS_ERR(cs)) {
1860                 i915_request_add(rq);
1861                 return PTR_ERR(cs);
1862         }
1863
1864         cs = emit_semaphore_poll(cs, mode, value, offset);
1865
1866         intel_ring_advance(rq, cs);
1867         i915_request_add(rq);
1868
1869         return 0;
1870 }
1871
1872 static int measure_inter_request(struct intel_context *ce)
1873 {
1874         u32 *sema = hwsp_scratch(ce);
1875         const u32 offset = hwsp_offset(ce, sema);
1876         u32 elapsed[TF_COUNT + 1], cycles;
1877         struct i915_sw_fence *submit;
1878         int i, err;
1879
1880         /*
1881          * Measure how long it takes to advance from one request into the
1882          * next. Between each request we flush the GPU caches to memory,
1883          * update the breadcrumbs, and then invalidate those caches.
1884          * We queue up all the requests to be submitted in one batch so
1885          * it should be one set of contiguous measurements.
1886          *
1887          *    A: read CS_TIMESTAMP on GPU
1888          *    advance request
1889          *    B: read CS_TIMESTAMP on GPU
1890          *
1891          * Request latency: B - A
1892          */
1893
1894         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1895         if (err)
1896                 return err;
1897
1898         submit = heap_fence_create(GFP_KERNEL);
1899         if (!submit) {
1900                 semaphore_set(sema, 1);
1901                 return -ENOMEM;
1902         }
1903
1904         intel_engine_flush_submission(ce->engine);
1905         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1906                 struct i915_request *rq;
1907                 u32 *cs;
1908
1909                 rq = i915_request_create(ce);
1910                 if (IS_ERR(rq)) {
1911                         err = PTR_ERR(rq);
1912                         goto err_submit;
1913                 }
1914
1915                 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1916                                                        submit,
1917                                                        GFP_KERNEL);
1918                 if (err < 0) {
1919                         i915_request_add(rq);
1920                         goto err_submit;
1921                 }
1922
1923                 cs = intel_ring_begin(rq, 4);
1924                 if (IS_ERR(cs)) {
1925                         i915_request_add(rq);
1926                         err = PTR_ERR(cs);
1927                         goto err_submit;
1928                 }
1929
1930                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1931
1932                 intel_ring_advance(rq, cs);
1933                 i915_request_add(rq);
1934         }
1935         local_bh_disable();
1936         i915_sw_fence_commit(submit);
1937         local_bh_enable();
1938         intel_engine_flush_submission(ce->engine);
1939         heap_fence_put(submit);
1940
1941         semaphore_set(sema, 1);
1942         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1943         if (err)
1944                 goto err;
1945
1946         for (i = 1; i <= TF_COUNT; i++)
1947                 elapsed[i - 1] = sema[i + 1] - sema[i];
1948
1949         cycles = trifilter(elapsed);
1950         pr_info("%s: inter-request latency %d cycles, %lluns\n",
1951                 ce->engine->name, cycles >> TF_BIAS,
1952                 cycles_to_ns(ce->engine, cycles));
1953
1954         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1955
1956 err_submit:
1957         i915_sw_fence_commit(submit);
1958         heap_fence_put(submit);
1959         semaphore_set(sema, 1);
1960 err:
1961         intel_gt_set_wedged(ce->engine->gt);
1962         return err;
1963 }
1964
1965 static int measure_context_switch(struct intel_context *ce)
1966 {
1967         u32 *sema = hwsp_scratch(ce);
1968         const u32 offset = hwsp_offset(ce, sema);
1969         struct i915_request *fence = NULL;
1970         u32 elapsed[TF_COUNT + 1], cycles;
1971         int i, j, err;
1972         u32 *cs;
1973
1974         /*
1975          * Measure how long it takes to advance from one request in one
1976          * context to a request in another context. This allows us to
1977          * measure how long the context save/restore takes, along with all
1978          * the inter-context setup we require.
1979          *
1980          *    A: read CS_TIMESTAMP on GPU
1981          *    switch context
1982          *    B: read CS_TIMESTAMP on GPU
1983          *
1984          * Context switch latency: B - A
1985          */
1986
1987         err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1988         if (err)
1989                 return err;
1990
1991         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1992                 struct intel_context *arr[] = {
1993                         ce, ce->engine->kernel_context
1994                 };
1995                 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1996
1997                 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1998                         struct i915_request *rq;
1999
2000                         rq = i915_request_create(arr[j]);
2001                         if (IS_ERR(rq)) {
2002                                 err = PTR_ERR(rq);
2003                                 goto err_fence;
2004                         }
2005
2006                         if (fence) {
2007                                 err = i915_request_await_dma_fence(rq,
2008                                                                    &fence->fence);
2009                                 if (err) {
2010                                         i915_request_add(rq);
2011                                         goto err_fence;
2012                                 }
2013                         }
2014
2015                         cs = intel_ring_begin(rq, 4);
2016                         if (IS_ERR(cs)) {
2017                                 i915_request_add(rq);
2018                                 err = PTR_ERR(cs);
2019                                 goto err_fence;
2020                         }
2021
2022                         cs = emit_timestamp_store(cs, ce, addr);
2023                         addr += sizeof(u32);
2024
2025                         intel_ring_advance(rq, cs);
2026
2027                         i915_request_put(fence);
2028                         fence = i915_request_get(rq);
2029
2030                         i915_request_add(rq);
2031                 }
2032         }
2033         i915_request_put(fence);
2034         intel_engine_flush_submission(ce->engine);
2035
2036         semaphore_set(sema, 1);
2037         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2038         if (err)
2039                 goto err;
2040
2041         for (i = 1; i <= TF_COUNT; i++)
2042                 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2043
2044         cycles = trifilter(elapsed);
2045         pr_info("%s: context switch latency %d cycles, %lluns\n",
2046                 ce->engine->name, cycles >> TF_BIAS,
2047                 cycles_to_ns(ce->engine, cycles));
2048
2049         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2050
2051 err_fence:
2052         i915_request_put(fence);
2053         semaphore_set(sema, 1);
2054 err:
2055         intel_gt_set_wedged(ce->engine->gt);
2056         return err;
2057 }
2058
2059 static int measure_preemption(struct intel_context *ce)
2060 {
2061         u32 *sema = hwsp_scratch(ce);
2062         const u32 offset = hwsp_offset(ce, sema);
2063         u32 elapsed[TF_COUNT], cycles;
2064         u32 *cs;
2065         int err;
2066         int i;
2067
2068         /*
2069          * We measure two latencies while triggering preemption. The first
2070          * latency is how long it takes for us to submit a preempting request.
2071          * The second latency is how long it takes for us to return from the
2072          * preemption back to the original context.
2073          *
2074          *    A: read CS_TIMESTAMP from CPU
2075          *    submit preemption
2076          *    B: read CS_TIMESTAMP on GPU (in preempting context)
2077          *    context switch
2078          *    C: read CS_TIMESTAMP on GPU (in original context)
2079          *
2080          * Preemption dispatch latency: B - A
2081          * Preemption switch latency: C - B
2082          */
2083
2084         if (!intel_engine_has_preemption(ce->engine))
2085                 return 0;
2086
2087         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2088                 u32 addr = offset + 2 * i * sizeof(u32);
2089                 struct i915_request *rq;
2090
2091                 rq = i915_request_create(ce);
2092                 if (IS_ERR(rq)) {
2093                         err = PTR_ERR(rq);
2094                         goto err;
2095                 }
2096
2097                 cs = intel_ring_begin(rq, 12);
2098                 if (IS_ERR(cs)) {
2099                         i915_request_add(rq);
2100                         err = PTR_ERR(cs);
2101                         goto err;
2102                 }
2103
2104                 cs = emit_store_dw(cs, addr, -1);
2105                 cs = emit_semaphore_poll_until(cs, offset, i);
2106                 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2107
2108                 intel_ring_advance(rq, cs);
2109                 i915_request_add(rq);
2110
2111                 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2112                         err = -EIO;
2113                         goto err;
2114                 }
2115
2116                 rq = i915_request_create(ce->engine->kernel_context);
2117                 if (IS_ERR(rq)) {
2118                         err = PTR_ERR(rq);
2119                         goto err;
2120                 }
2121
2122                 cs = intel_ring_begin(rq, 8);
2123                 if (IS_ERR(cs)) {
2124                         i915_request_add(rq);
2125                         err = PTR_ERR(cs);
2126                         goto err;
2127                 }
2128
2129                 cs = emit_timestamp_store(cs, ce, addr);
2130                 cs = emit_store_dw(cs, offset, i);
2131
2132                 intel_ring_advance(rq, cs);
2133                 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2134
2135                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136                 i915_request_add(rq);
2137         }
2138
2139         if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2140                 err = -EIO;
2141                 goto err;
2142         }
2143
2144         for (i = 1; i <= TF_COUNT; i++)
2145                 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2146
2147         cycles = trifilter(elapsed);
2148         pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2149                 ce->engine->name, cycles >> TF_BIAS,
2150                 cycles_to_ns(ce->engine, cycles));
2151
2152         for (i = 1; i <= TF_COUNT; i++)
2153                 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2154
2155         cycles = trifilter(elapsed);
2156         pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2157                 ce->engine->name, cycles >> TF_BIAS,
2158                 cycles_to_ns(ce->engine, cycles));
2159
2160         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2161
2162 err:
2163         intel_gt_set_wedged(ce->engine->gt);
2164         return err;
2165 }
2166
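/*
 * dma-fence callback used by measure_completion() to note, on the CPU,
 * the moment the request's completion signal has been processed.
 */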
2167 struct signal_cb {
2168         struct dma_fence_cb base;
2169         bool seen;
2170 };
2171
2172 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2173 {
2174         struct signal_cb *s = container_of(cb, typeof(*s), base);
2175
2176         smp_store_mb(s->seen, true); /* be safe, be strong */
2177 }
2178
2179 static int measure_completion(struct intel_context *ce)
2180 {
2181         u32 *sema = hwsp_scratch(ce);
2182         const u32 offset = hwsp_offset(ce, sema);
2183         u32 elapsed[TF_COUNT], cycles;
2184         u32 *cs;
2185         int err;
2186         int i;
2187
2188         /*
2189          * Measure how long it takes for the signal (interrupt) sent by
2190          * the GPU to be processed by the CPU.
2191          *
2192          *    A: read CS_TIMESTAMP on GPU
2193          *    signal
2194          *    B: read CS_TIMESTAMP from CPU
2195          *
2196          * Completion latency: B - A
2197          */
2198
2199         for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2200                 struct signal_cb cb = { .seen = false };
2201                 struct i915_request *rq;
2202
2203                 rq = i915_request_create(ce);
2204                 if (IS_ERR(rq)) {
2205                         err = PTR_ERR(rq);
2206                         goto err;
2207                 }
2208
2209                 cs = intel_ring_begin(rq, 12);
2210                 if (IS_ERR(cs)) {
2211                         i915_request_add(rq);
2212                         err = PTR_ERR(cs);
2213                         goto err;
2214                 }
2215
2216                 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2217                 cs = emit_semaphore_poll_until(cs, offset, i);
2218                 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2219
2220                 intel_ring_advance(rq, cs);
2221
2222                 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2223
2224                 local_bh_disable();
2225                 i915_request_add(rq);
2226                 local_bh_enable();
2227
2228                 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2229                         err = -EIO;
2230                         goto err;
2231                 }
2232
2233                 preempt_disable();
2234                 semaphore_set(sema, i);
2235                 while (!READ_ONCE(cb.seen))
2236                         cpu_relax();
2237
2238                 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2239                 preempt_enable();
2240         }
2241
2242         err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2243         if (err)
2244                 goto err;
2245
2246         for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2247                 GEM_BUG_ON(sema[i + 1] == -1);
2248                 elapsed[i] = elapsed[i] - sema[i + 1];
2249         }
2250
2251         cycles = trifilter(elapsed);
2252         pr_info("%s: completion latency %d cycles, %lluns\n",
2253                 ce->engine->name, cycles >> TF_BIAS,
2254                 cycles_to_ns(ce->engine, cycles));
2255
2256         return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2257
2258 err:
2259         intel_gt_set_wedged(ce->engine->gt);
2260         return err;
2261 }
2262
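/*
 * Hold the GPU at its maximum frequency (and keep forcewake) while we
 * measure, so that the latencies are not inflated by RPS ramp-up.
 */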
2263 static void rps_pin(struct intel_gt *gt)
2264 {
2265         /* Pin the frequency to max */
2266         atomic_inc(&gt->rps.num_waiters);
2267         intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2268
2269         mutex_lock(&gt->rps.lock);
2270         intel_rps_set(&gt->rps, gt->rps.max_freq);
2271         mutex_unlock(&gt->rps.lock);
2272 }
2273
2274 static void rps_unpin(struct intel_gt *gt)
2275 {
2276         intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2277         atomic_dec(&gt->rps.num_waiters);
2278 }
2279
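/*
 * Run the latency probes above on every uabi engine, with the heartbeat
 * disabled and the frequency pinned for stable results; a probe that
 * times out wedges the GT and fails the subtest.
 */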
2280 static int perf_request_latency(void *arg)
2281 {
2282         struct drm_i915_private *i915 = arg;
2283         struct intel_engine_cs *engine;
2284         struct pm_qos_request qos;
2285         int err = 0;
2286
2287         if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2288                 return 0;
2289
2290         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2291
2292         for_each_uabi_engine(engine, i915) {
2293                 struct intel_context *ce;
2294
2295                 ce = intel_context_create(engine);
2296                 if (IS_ERR(ce)) {
2297                         err = PTR_ERR(ce);
2298                         goto out;
2299                 }
2300
2301                 err = intel_context_pin(ce);
2302                 if (err) {
2303                         intel_context_put(ce);
2304                         goto out;
2305                 }
2306
2307                 st_engine_heartbeat_disable(engine);
2308                 rps_pin(engine->gt);
2309
2310                 if (err == 0)
2311                         err = measure_semaphore_response(ce);
2312                 if (err == 0)
2313                         err = measure_idle_dispatch(ce);
2314                 if (err == 0)
2315                         err = measure_busy_dispatch(ce);
2316                 if (err == 0)
2317                         err = measure_inter_request(ce);
2318                 if (err == 0)
2319                         err = measure_context_switch(ce);
2320                 if (err == 0)
2321                         err = measure_preemption(ce);
2322                 if (err == 0)
2323                         err = measure_completion(ce);
2324
2325                 rps_unpin(engine->gt);
2326                 st_engine_heartbeat_enable(engine);
2327
2328                 intel_context_unpin(ce);
2329                 intel_context_put(ce);
2330                 if (err)
2331                         goto out;
2332         }
2333
2334 out:
2335         if (igt_flush_test(i915))
2336                 err = -EIO;
2337
2338         cpu_latency_qos_remove_request(&qos);
2339         return err;
2340 }
2341
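/*
 * Series workloads, driven from a single thread across all engines in
 * turn: s_sync0 waits for each request before emitting the next,
 * s_sync1 keeps one request in flight by waiting on the previous one,
 * and s_many submits as fast as possible without waiting at all.
 */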
2342 static int s_sync0(void *arg)
2343 {
2344         struct perf_series *ps = arg;
2345         IGT_TIMEOUT(end_time);
2346         unsigned int idx = 0;
2347         int err = 0;
2348
2349         GEM_BUG_ON(!ps->nengines);
2350         do {
2351                 struct i915_request *rq;
2352
2353                 rq = i915_request_create(ps->ce[idx]);
2354                 if (IS_ERR(rq)) {
2355                         err = PTR_ERR(rq);
2356                         break;
2357                 }
2358
2359                 i915_request_get(rq);
2360                 i915_request_add(rq);
2361
2362                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2363                         err = -ETIME;
2364                 i915_request_put(rq);
2365                 if (err)
2366                         break;
2367
2368                 if (++idx == ps->nengines)
2369                         idx = 0;
2370         } while (!__igt_timeout(end_time, NULL));
2371
2372         return err;
2373 }
2374
2375 static int s_sync1(void *arg)
2376 {
2377         struct perf_series *ps = arg;
2378         struct i915_request *prev = NULL;
2379         IGT_TIMEOUT(end_time);
2380         unsigned int idx = 0;
2381         int err = 0;
2382
2383         GEM_BUG_ON(!ps->nengines);
2384         do {
2385                 struct i915_request *rq;
2386
2387                 rq = i915_request_create(ps->ce[idx]);
2388                 if (IS_ERR(rq)) {
2389                         err = PTR_ERR(rq);
2390                         break;
2391                 }
2392
2393                 i915_request_get(rq);
2394                 i915_request_add(rq);
2395
2396                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2397                         err = -ETIME;
2398                 i915_request_put(prev);
2399                 prev = rq;
2400                 if (err)
2401                         break;
2402
2403                 if (++idx == ps->nengines)
2404                         idx = 0;
2405         } while (!__igt_timeout(end_time, NULL));
2406         i915_request_put(prev);
2407
2408         return err;
2409 }
2410
2411 static int s_many(void *arg)
2412 {
2413         struct perf_series *ps = arg;
2414         IGT_TIMEOUT(end_time);
2415         unsigned int idx = 0;
2416
2417         GEM_BUG_ON(!ps->nengines);
2418         do {
2419                 struct i915_request *rq;
2420
2421                 rq = i915_request_create(ps->ce[idx]);
2422                 if (IS_ERR(rq))
2423                         return PTR_ERR(rq);
2424
2425                 i915_request_add(rq);
2426
2427                 if (++idx == ps->nengines)
2428                         idx = 0;
2429         } while (!__igt_timeout(end_time, NULL));
2430
2431         return 0;
2432 }
2433
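/*
 * Run each series workload over a shared set of pinned contexts, one per
 * uabi engine, and report per-engine busyness, context runtime and
 * walltime afterwards.
 */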
2434 static int perf_series_engines(void *arg)
2435 {
2436         struct drm_i915_private *i915 = arg;
2437         static int (* const func[])(void *arg) = {
2438                 s_sync0,
2439                 s_sync1,
2440                 s_many,
2441                 NULL,
2442         };
2443         const unsigned int nengines = num_uabi_engines(i915);
2444         struct intel_engine_cs *engine;
2445         int (* const *fn)(void *arg);
2446         struct pm_qos_request qos;
2447         struct perf_stats *stats;
2448         struct perf_series *ps;
2449         unsigned int idx;
2450         int err = 0;
2451
2452         stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2453         if (!stats)
2454                 return -ENOMEM;
2455
2456         ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2457         if (!ps) {
2458                 kfree(stats);
2459                 return -ENOMEM;
2460         }
2461
2462         cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2463
2464         ps->i915 = i915;
2465         ps->nengines = nengines;
2466
2467         idx = 0;
2468         for_each_uabi_engine(engine, i915) {
2469                 struct intel_context *ce;
2470
2471                 ce = intel_context_create(engine);
2472                 if (IS_ERR(ce)) {
2473                         err = PTR_ERR(ce);
2474                         goto out;
2475                 }
2476
2477                 err = intel_context_pin(ce);
2478                 if (err) {
2479                         intel_context_put(ce);
2480                         goto out;
2481                 }
2482
2483                 ps->ce[idx++] = ce;
2484         }
2485         GEM_BUG_ON(idx != ps->nengines);
2486
2487         for (fn = func; *fn && !err; fn++) {
2488                 char name[KSYM_NAME_LEN];
2489                 struct igt_live_test t;
2490
2491                 snprintf(name, sizeof(name), "%ps", *fn);
2492                 err = igt_live_test_begin(&t, i915, __func__, name);
2493                 if (err)
2494                         break;
2495
2496                 for (idx = 0; idx < nengines; idx++) {
2497                         struct perf_stats *p =
2498                                 memset(&stats[idx], 0, sizeof(stats[idx]));
2499                         struct intel_context *ce = ps->ce[idx];
2500
2501                         p->engine = ps->ce[idx]->engine;
2502                         intel_engine_pm_get(p->engine);
2503
2504                         if (intel_engine_supports_stats(p->engine))
2505                                 p->busy = intel_engine_get_busy_time(p->engine,
2506                                                                      &p->time) + 1;
2507                         else
2508                                 p->time = ktime_get();
2509                         p->runtime = -intel_context_get_total_runtime_ns(ce);
2510                 }
2511
2512                 err = (*fn)(ps);
2513                 if (igt_live_test_end(&t))
2514                         err = -EIO;
2515
2516                 for (idx = 0; idx < nengines; idx++) {
2517                         struct perf_stats *p = &stats[idx];
2518                         struct intel_context *ce = ps->ce[idx];
2519                         int integer, decimal;
2520                         u64 busy, dt, now;
2521
2522                         if (p->busy)
2523                                 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2524                                                                                &now),
2525                                                     p->busy - 1);
2526                         else
2527                                 now = ktime_get();
2528                         p->time = ktime_sub(now, p->time);
2529
2530                         err = switch_to_kernel_sync(ce, err);
2531                         p->runtime += intel_context_get_total_runtime_ns(ce);
2532                         intel_engine_pm_put(p->engine);
2533
2534                         busy = 100 * ktime_to_ns(p->busy);
2535                         dt = ktime_to_ns(p->time);
2536                         if (dt) {
2537                                 integer = div64_u64(busy, dt);
2538                                 busy -= integer * dt;
2539                                 decimal = div64_u64(100 * busy, dt);
2540                         } else {
2541                                 integer = 0;
2542                                 decimal = 0;
2543                         }
2544
2545                         pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2546                                 name, p->engine->name, ce->timeline->seqno,
2547                                 integer, decimal,
2548                                 div_u64(p->runtime, 1000 * 1000),
2549                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2550                 }
2551         }
2552
2553 out:
2554         for (idx = 0; idx < nengines; idx++) {
2555                 if (IS_ERR_OR_NULL(ps->ce[idx]))
2556                         break;
2557
2558                 intel_context_unpin(ps->ce[idx]);
2559                 intel_context_put(ps->ce[idx]);
2560         }
2561         kfree(ps);
2562
2563         cpu_latency_qos_remove_request(&qos);
2564         kfree(stats);
2565         return err;
2566 }
2567
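/*
 * Per-engine variants of the series workloads, each run from its own
 * kthread (see perf_parallel_engines()): p_sync0 waits for every
 * request, p_sync1 waits one request behind, and p_many submits without
 * waiting, with each thread recording its own perf_stats.
 */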
2568 static int p_sync0(void *arg)
2569 {
2570         struct perf_stats *p = arg;
2571         struct intel_engine_cs *engine = p->engine;
2572         struct intel_context *ce;
2573         IGT_TIMEOUT(end_time);
2574         unsigned long count;
2575         bool busy;
2576         int err = 0;
2577
2578         ce = intel_context_create(engine);
2579         if (IS_ERR(ce))
2580                 return PTR_ERR(ce);
2581
2582         err = intel_context_pin(ce);
2583         if (err) {
2584                 intel_context_put(ce);
2585                 return err;
2586         }
2587
2588         if (intel_engine_supports_stats(engine)) {
2589                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2590                 busy = true;
2591         } else {
2592                 p->time = ktime_get();
2593                 busy = false;
2594         }
2595
2596         count = 0;
2597         do {
2598                 struct i915_request *rq;
2599
2600                 rq = i915_request_create(ce);
2601                 if (IS_ERR(rq)) {
2602                         err = PTR_ERR(rq);
2603                         break;
2604                 }
2605
2606                 i915_request_get(rq);
2607                 i915_request_add(rq);
2608
2609                 err = 0;
2610                 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2611                         err = -ETIME;
2612                 i915_request_put(rq);
2613                 if (err)
2614                         break;
2615
2616                 count++;
2617         } while (!__igt_timeout(end_time, NULL));
2618
2619         if (busy) {
2620                 ktime_t now;
2621
2622                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2623                                     p->busy);
2624                 p->time = ktime_sub(now, p->time);
2625         } else {
2626                 p->time = ktime_sub(ktime_get(), p->time);
2627         }
2628
2629         err = switch_to_kernel_sync(ce, err);
2630         p->runtime = intel_context_get_total_runtime_ns(ce);
2631         p->count = count;
2632
2633         intel_context_unpin(ce);
2634         intel_context_put(ce);
2635         return err;
2636 }
2637
2638 static int p_sync1(void *arg)
2639 {
2640         struct perf_stats *p = arg;
2641         struct intel_engine_cs *engine = p->engine;
2642         struct i915_request *prev = NULL;
2643         struct intel_context *ce;
2644         IGT_TIMEOUT(end_time);
2645         unsigned long count;
2646         bool busy;
2647         int err = 0;
2648
2649         ce = intel_context_create(engine);
2650         if (IS_ERR(ce))
2651                 return PTR_ERR(ce);
2652
2653         err = intel_context_pin(ce);
2654         if (err) {
2655                 intel_context_put(ce);
2656                 return err;
2657         }
2658
2659         if (intel_engine_supports_stats(engine)) {
2660                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2661                 busy = true;
2662         } else {
2663                 p->time = ktime_get();
2664                 busy = false;
2665         }
2666
2667         count = 0;
2668         do {
2669                 struct i915_request *rq;
2670
2671                 rq = i915_request_create(ce);
2672                 if (IS_ERR(rq)) {
2673                         err = PTR_ERR(rq);
2674                         break;
2675                 }
2676
2677                 i915_request_get(rq);
2678                 i915_request_add(rq);
2679
2680                 err = 0;
2681                 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2682                         err = -ETIME;
2683                 i915_request_put(prev);
2684                 prev = rq;
2685                 if (err)
2686                         break;
2687
2688                 count++;
2689         } while (!__igt_timeout(end_time, NULL));
2690         i915_request_put(prev);
2691
2692         if (busy) {
2693                 ktime_t now;
2694
2695                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2696                                     p->busy);
2697                 p->time = ktime_sub(now, p->time);
2698         } else {
2699                 p->time = ktime_sub(ktime_get(), p->time);
2700         }
2701
2702         err = switch_to_kernel_sync(ce, err);
2703         p->runtime = intel_context_get_total_runtime_ns(ce);
2704         p->count = count;
2705
2706         intel_context_unpin(ce);
2707         intel_context_put(ce);
2708         return err;
2709 }
2710
2711 static int p_many(void *arg)
2712 {
2713         struct perf_stats *p = arg;
2714         struct intel_engine_cs *engine = p->engine;
2715         struct intel_context *ce;
2716         IGT_TIMEOUT(end_time);
2717         unsigned long count;
2718         int err = 0;
2719         bool busy;
2720
2721         ce = intel_context_create(engine);
2722         if (IS_ERR(ce))
2723                 return PTR_ERR(ce);
2724
2725         err = intel_context_pin(ce);
2726         if (err) {
2727                 intel_context_put(ce);
2728                 return err;
2729         }
2730
2731         if (intel_engine_supports_stats(engine)) {
2732                 p->busy = intel_engine_get_busy_time(engine, &p->time);
2733                 busy = true;
2734         } else {
2735                 p->time = ktime_get();
2736                 busy = false;
2737         }
2738
2739         count = 0;
2740         do {
2741                 struct i915_request *rq;
2742
2743                 rq = i915_request_create(ce);
2744                 if (IS_ERR(rq)) {
2745                         err = PTR_ERR(rq);
2746                         break;
2747                 }
2748
2749                 i915_request_add(rq);
2750                 count++;
2751         } while (!__igt_timeout(end_time, NULL));
2752
2753         if (busy) {
2754                 ktime_t now;
2755
2756                 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2757                                     p->busy);
2758                 p->time = ktime_sub(now, p->time);
2759         } else {
2760                 p->time = ktime_sub(ktime_get(), p->time);
2761         }
2762
2763         err = switch_to_kernel_sync(ce, err);
2764         p->runtime = intel_context_get_total_runtime_ns(ce);
2765         p->count = count;
2766
2767         intel_context_unpin(ce);
2768         intel_context_put(ce);
2769         return err;
2770 }
2771
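/*
 * Spawn one kthread per uabi engine, all running the same workload on
 * their own contexts simultaneously, then summarise the per-engine
 * throughput and busyness.
 */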
2772 static int perf_parallel_engines(void *arg)
2773 {
2774         struct drm_i915_private *i915 = arg;
2775         static int (* const func[])(void *arg) = {
2776                 p_sync0,
2777                 p_sync1,
2778                 p_many,
2779                 NULL,
2780         };
2781         const unsigned int nengines = num_uabi_engines(i915);
2782         struct intel_engine_cs *engine;
2783         int (* const *fn)(void *arg);
2784         struct pm_qos_request qos;
2785         struct {
2786                 struct perf_stats p;
2787                 struct task_struct *tsk;
2788         } *engines;
2789         int err = 0;
2790
2791         engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2792         if (!engines)
2793                 return -ENOMEM;
2794
2795         cpu_latency_qos_add_request(&qos, 0);
2796
2797         for (fn = func; *fn; fn++) {
2798                 char name[KSYM_NAME_LEN];
2799                 struct igt_live_test t;
2800                 unsigned int idx;
2801
2802                 snprintf(name, sizeof(name), "%ps", *fn);
2803                 err = igt_live_test_begin(&t, i915, __func__, name);
2804                 if (err)
2805                         break;
2806
2807                 atomic_set(&i915->selftest.counter, nengines);
2808
2809                 idx = 0;
2810                 for_each_uabi_engine(engine, i915) {
2811                         intel_engine_pm_get(engine);
2812
2813                         memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2814                         engines[idx].p.engine = engine;
2815
2816                         engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2817                                                        "igt:%s", engine->name);
2818                         if (IS_ERR(engines[idx].tsk)) {
2819                                 err = PTR_ERR(engines[idx].tsk);
2820                                 intel_engine_pm_put(engine);
2821                                 break;
2822                         }
2823                         get_task_struct(engines[idx++].tsk);
2824                 }
2825
2826                 yield(); /* start all threads before we kthread_stop() */
2827
2828                 idx = 0;
2829                 for_each_uabi_engine(engine, i915) {
2830                         int status;
2831
2832                         if (IS_ERR(engines[idx].tsk))
2833                                 break;
2834
2835                         status = kthread_stop(engines[idx].tsk);
2836                         if (status && !err)
2837                                 err = status;
2838
2839                         intel_engine_pm_put(engine);
2840                         put_task_struct(engines[idx++].tsk);
2841                 }
2842
2843                 if (igt_live_test_end(&t))
2844                         err = -EIO;
2845                 if (err)
2846                         break;
2847
2848                 idx = 0;
2849                 for_each_uabi_engine(engine, i915) {
2850                         struct perf_stats *p = &engines[idx].p;
2851                         u64 busy = 100 * ktime_to_ns(p->busy);
2852                         u64 dt = ktime_to_ns(p->time);
2853                         int integer, decimal;
2854
2855                         if (dt) {
2856                                 integer = div64_u64(busy, dt);
2857                                 busy -= integer * dt;
2858                                 decimal = div64_u64(100 * busy, dt);
2859                         } else {
2860                                 integer = 0;
2861                                 decimal = 0;
2862                         }
2863
2864                         GEM_BUG_ON(engine != p->engine);
2865                         pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2866                                 name, engine->name, p->count, integer, decimal,
2867                                 div_u64(p->runtime, 1000 * 1000),
2868                                 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2869                         idx++;
2870                 }
2871         }
2872
2873         cpu_latency_qos_remove_request(&qos);
2874         kfree(engines);
2875         return err;
2876 }
2877
2878 int i915_request_perf_selftests(struct drm_i915_private *i915)
2879 {
2880         static const struct i915_subtest tests[] = {
2881                 SUBTEST(perf_request_latency),
2882                 SUBTEST(perf_series_engines),
2883                 SUBTEST(perf_parallel_engines),
2884         };
2885
2886         if (intel_gt_is_wedged(&i915->gt))
2887                 return 0;
2888
2889         return i915_subtests(tests, i915);
2890 }