GNU Linux-libre 5.10.215-gnu1: drivers/gpu/drm/i915/gt/selftest_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "gem/i915_gem_context.h"
28
29 #include "intel_gt.h"
30 #include "intel_engine_heartbeat.h"
31 #include "intel_engine_pm.h"
32 #include "selftest_engine_heartbeat.h"
33
34 #include "i915_selftest.h"
35 #include "selftests/i915_random.h"
36 #include "selftests/igt_flush_test.h"
37 #include "selftests/igt_reset.h"
38 #include "selftests/igt_atomic.h"
39
40 #include "selftests/mock_drm.h"
41
42 #include "gem/selftests/mock_context.h"
43 #include "gem/selftests/igt_gem_utils.h"
44
45 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
46
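/*
 * Fixture shared by the hang tests: an unbannable kernel context plus two
 * internal objects, one page of per-context seqno slots (mapped at ->seqno,
 * a stand-in status page) and one page holding the looping batch (->batch).
 */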
47 struct hang {
48         struct intel_gt *gt;
49         struct drm_i915_gem_object *hws;
50         struct drm_i915_gem_object *obj;
51         struct i915_gem_context *ctx;
52         u32 *seqno;
53         u32 *batch;
54 };
55
56 static int hang_init(struct hang *h, struct intel_gt *gt)
57 {
58         void *vaddr;
59         int err;
60
61         memset(h, 0, sizeof(*h));
62         h->gt = gt;
63
64         h->ctx = kernel_context(gt->i915);
65         if (IS_ERR(h->ctx))
66                 return PTR_ERR(h->ctx);
67
68         GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
69
70         h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
71         if (IS_ERR(h->hws)) {
72                 err = PTR_ERR(h->hws);
73                 goto err_ctx;
74         }
75
76         h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
77         if (IS_ERR(h->obj)) {
78                 err = PTR_ERR(h->obj);
79                 goto err_hws;
80         }
81
82         i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
83         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
84         if (IS_ERR(vaddr)) {
85                 err = PTR_ERR(vaddr);
86                 goto err_obj;
87         }
88         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
89
90         vaddr = i915_gem_object_pin_map(h->obj,
91                                         i915_coherent_map_type(gt->i915));
92         if (IS_ERR(vaddr)) {
93                 err = PTR_ERR(vaddr);
94                 goto err_unpin_hws;
95         }
96         h->batch = vaddr;
97
98         return 0;
99
100 err_unpin_hws:
101         i915_gem_object_unpin_map(h->hws);
102 err_obj:
103         i915_gem_object_put(h->obj);
104 err_hws:
105         i915_gem_object_put(h->hws);
106 err_ctx:
107         kernel_context_close(h->ctx);
108         return err;
109 }
110
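/*
 * Each request advertises that it has started by writing its seqno into a
 * per-context u32 slot of the HWS page; hws_seqno() reads the same slot
 * back to poll for progress.
 */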
111 static u64 hws_address(const struct i915_vma *hws,
112                        const struct i915_request *rq)
113 {
114         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
115 }
116
117 static int move_to_active(struct i915_vma *vma,
118                           struct i915_request *rq,
119                           unsigned int flags)
120 {
121         int err;
122
123         i915_vma_lock(vma);
124         err = i915_request_await_object(rq, vma->obj,
125                                         flags & EXEC_OBJECT_WRITE);
126         if (err == 0)
127                 err = i915_vma_move_to_active(vma, rq, flags);
128         i915_vma_unlock(vma);
129
130         return err;
131 }
132
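/*
 * Build a request whose batch writes the request's seqno into its HWS slot
 * and then spins forever by branching back to the start of the batch; the
 * loop is only broken when the test rewrites the first dword of the batch
 * to MI_BATCH_BUFFER_END (see hang_fini()).
 */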
133 static struct i915_request *
134 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
135 {
136         struct intel_gt *gt = h->gt;
137         struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
138         struct drm_i915_gem_object *obj;
139         struct i915_request *rq = NULL;
140         struct i915_vma *hws, *vma;
141         unsigned int flags;
142         void *vaddr;
143         u32 *batch;
144         int err;
145
146         obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
147         if (IS_ERR(obj)) {
148                 i915_vm_put(vm);
149                 return ERR_CAST(obj);
150         }
151
152         vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
153         if (IS_ERR(vaddr)) {
154                 i915_gem_object_put(obj);
155                 i915_vm_put(vm);
156                 return ERR_CAST(vaddr);
157         }
158
159         i915_gem_object_unpin_map(h->obj);
160         i915_gem_object_put(h->obj);
161
162         h->obj = obj;
163         h->batch = vaddr;
164
165         vma = i915_vma_instance(h->obj, vm, NULL);
166         if (IS_ERR(vma)) {
167                 i915_vm_put(vm);
168                 return ERR_CAST(vma);
169         }
170
171         hws = i915_vma_instance(h->hws, vm, NULL);
172         if (IS_ERR(hws)) {
173                 i915_vm_put(vm);
174                 return ERR_CAST(hws);
175         }
176
177         err = i915_vma_pin(vma, 0, 0, PIN_USER);
178         if (err) {
179                 i915_vm_put(vm);
180                 return ERR_PTR(err);
181         }
182
183         err = i915_vma_pin(hws, 0, 0, PIN_USER);
184         if (err)
185                 goto unpin_vma;
186
187         rq = igt_request_alloc(h->ctx, engine);
188         if (IS_ERR(rq)) {
189                 err = PTR_ERR(rq);
190                 goto unpin_hws;
191         }
192
193         err = move_to_active(vma, rq, 0);
194         if (err)
195                 goto cancel_rq;
196
197         err = move_to_active(hws, rq, 0);
198         if (err)
199                 goto cancel_rq;
200
201         batch = h->batch;
202         if (INTEL_GEN(gt->i915) >= 8) {
203                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
204                 *batch++ = lower_32_bits(hws_address(hws, rq));
205                 *batch++ = upper_32_bits(hws_address(hws, rq));
206                 *batch++ = rq->fence.seqno;
207                 *batch++ = MI_NOOP;
208
209                 memset(batch, 0, 1024);
210                 batch += 1024 / sizeof(*batch);
211
212                 *batch++ = MI_NOOP;
213                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
214                 *batch++ = lower_32_bits(vma->node.start);
215                 *batch++ = upper_32_bits(vma->node.start);
216         } else if (INTEL_GEN(gt->i915) >= 6) {
217                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
218                 *batch++ = 0;
219                 *batch++ = lower_32_bits(hws_address(hws, rq));
220                 *batch++ = rq->fence.seqno;
221                 *batch++ = MI_NOOP;
222
223                 memset(batch, 0, 1024);
224                 batch += 1024 / sizeof(*batch);
225
226                 *batch++ = MI_NOOP;
227                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
228                 *batch++ = lower_32_bits(vma->node.start);
229         } else if (INTEL_GEN(gt->i915) >= 4) {
230                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
231                 *batch++ = 0;
232                 *batch++ = lower_32_bits(hws_address(hws, rq));
233                 *batch++ = rq->fence.seqno;
234                 *batch++ = MI_NOOP;
235
236                 memset(batch, 0, 1024);
237                 batch += 1024 / sizeof(*batch);
238
239                 *batch++ = MI_NOOP;
240                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
241                 *batch++ = lower_32_bits(vma->node.start);
242         } else {
243                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
244                 *batch++ = lower_32_bits(hws_address(hws, rq));
245                 *batch++ = rq->fence.seqno;
246                 *batch++ = MI_NOOP;
247
248                 memset(batch, 0, 1024);
249                 batch += 1024 / sizeof(*batch);
250
251                 *batch++ = MI_NOOP;
252                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
253                 *batch++ = lower_32_bits(vma->node.start);
254         }
255         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
256         intel_gt_chipset_flush(engine->gt);
257
258         if (rq->engine->emit_init_breadcrumb) {
259                 err = rq->engine->emit_init_breadcrumb(rq);
260                 if (err)
261                         goto cancel_rq;
262         }
263
264         flags = 0;
265         if (INTEL_GEN(gt->i915) <= 5)
266                 flags |= I915_DISPATCH_SECURE;
267
268         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
269
270 cancel_rq:
271         if (err) {
272                 i915_request_set_error_once(rq, err);
273                 i915_request_add(rq);
274         }
275 unpin_hws:
276         i915_vma_unpin(hws);
277 unpin_vma:
278         i915_vma_unpin(vma);
279         i915_vm_put(vm);
280         return err ? ERR_PTR(err) : rq;
281 }
282
283 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
284 {
285         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
286 }
287
288 static void hang_fini(struct hang *h)
289 {
290         *h->batch = MI_BATCH_BUFFER_END;
291         intel_gt_chipset_flush(h->gt);
292
293         i915_gem_object_unpin_map(h->obj);
294         i915_gem_object_put(h->obj);
295
296         i915_gem_object_unpin_map(h->hws);
297         i915_gem_object_put(h->hws);
298
299         kernel_context_close(h->ctx);
300
301         igt_flush_test(h->gt->i915);
302 }
303
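/*
 * Poll the request's HWS slot for its seqno: a brief busy-wait followed by
 * a sleeping wait of up to a second, returning true once the hanging batch
 * has started executing on the GPU.
 */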
304 static bool wait_until_running(struct hang *h, struct i915_request *rq)
305 {
306         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
307                                                rq->fence.seqno),
308                              10) &&
309                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
310                                             rq->fence.seqno),
311                           1000));
312 }
313
314 static int igt_hang_sanitycheck(void *arg)
315 {
316         struct intel_gt *gt = arg;
317         struct i915_request *rq;
318         struct intel_engine_cs *engine;
319         enum intel_engine_id id;
320         struct hang h;
321         int err;
322
323         /* Basic check that we can execute our hanging batch */
324
325         err = hang_init(&h, gt);
326         if (err)
327                 return err;
328
329         for_each_engine(engine, gt, id) {
330                 struct intel_wedge_me w;
331                 long timeout;
332
333                 if (!intel_engine_can_store_dword(engine))
334                         continue;
335
336                 rq = hang_create_request(&h, engine);
337                 if (IS_ERR(rq)) {
338                         err = PTR_ERR(rq);
339                         pr_err("Failed to create request for %s, err=%d\n",
340                                engine->name, err);
341                         goto fini;
342                 }
343
344                 i915_request_get(rq);
345
346                 *h.batch = MI_BATCH_BUFFER_END;
347                 intel_gt_chipset_flush(engine->gt);
348
349                 i915_request_add(rq);
350
351                 timeout = 0;
352                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
353                         timeout = i915_request_wait(rq, 0,
354                                                     MAX_SCHEDULE_TIMEOUT);
355                 if (intel_gt_is_wedged(gt))
356                         timeout = -EIO;
357
358                 i915_request_put(rq);
359
360                 if (timeout < 0) {
361                         err = timeout;
362                         pr_err("Wait for request failed on %s, err=%d\n",
363                                engine->name, err);
364                         goto fini;
365                 }
366         }
367
368 fini:
369         hang_fini(&h);
370         return err;
371 }
372
373 static bool wait_for_idle(struct intel_engine_cs *engine)
374 {
375         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
376 }
377
378 static int igt_reset_nop(void *arg)
379 {
380         struct intel_gt *gt = arg;
381         struct i915_gpu_error *global = &gt->i915->gpu_error;
382         struct intel_engine_cs *engine;
383         unsigned int reset_count, count;
384         enum intel_engine_id id;
385         IGT_TIMEOUT(end_time);
386         int err = 0;
387
388         /* Check that we can reset during non-user portions of requests */
389
390         reset_count = i915_reset_count(global);
391         count = 0;
392         do {
393                 for_each_engine(engine, gt, id) {
394                         struct intel_context *ce;
395                         int i;
396
397                         ce = intel_context_create(engine);
398                         if (IS_ERR(ce)) {
399                                 err = PTR_ERR(ce);
400                                 break;
401                         }
402
403                         for (i = 0; i < 16; i++) {
404                                 struct i915_request *rq;
405
406                                 rq = intel_context_create_request(ce);
407                                 if (IS_ERR(rq)) {
408                                         err = PTR_ERR(rq);
409                                         break;
410                                 }
411
412                                 i915_request_add(rq);
413                         }
414
415                         intel_context_put(ce);
416                 }
417
418                 igt_global_reset_lock(gt);
419                 intel_gt_reset(gt, ALL_ENGINES, NULL);
420                 igt_global_reset_unlock(gt);
421
422                 if (intel_gt_is_wedged(gt)) {
423                         err = -EIO;
424                         break;
425                 }
426
427                 if (i915_reset_count(global) != reset_count + ++count) {
428                         pr_err("Full GPU reset not recorded!\n");
429                         err = -EINVAL;
430                         break;
431                 }
432
433                 err = igt_flush_test(gt->i915);
434                 if (err)
435                         break;
436         } while (time_before(jiffies, end_time));
437         pr_info("%s: %d resets\n", __func__, count);
438
439         if (igt_flush_test(gt->i915))
440                 err = -EIO;
441         return err;
442 }
443
444 static int igt_reset_nop_engine(void *arg)
445 {
446         struct intel_gt *gt = arg;
447         struct i915_gpu_error *global = &gt->i915->gpu_error;
448         struct intel_engine_cs *engine;
449         enum intel_engine_id id;
450
451         /* Check that we can engine-reset during non-user portions */
452
453         if (!intel_has_reset_engine(gt))
454                 return 0;
455
456         for_each_engine(engine, gt, id) {
457                 unsigned int reset_count, reset_engine_count, count;
458                 struct intel_context *ce;
459                 IGT_TIMEOUT(end_time);
460                 int err;
461
462                 ce = intel_context_create(engine);
463                 if (IS_ERR(ce))
464                         return PTR_ERR(ce);
465
466                 reset_count = i915_reset_count(global);
467                 reset_engine_count = i915_reset_engine_count(global, engine);
468                 count = 0;
469
470                 st_engine_heartbeat_disable(engine);
471                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
472                 do {
473                         int i;
474
475                         if (!wait_for_idle(engine)) {
476                                 pr_err("%s failed to idle before reset\n",
477                                        engine->name);
478                                 err = -EIO;
479                                 break;
480                         }
481
482                         for (i = 0; i < 16; i++) {
483                                 struct i915_request *rq;
484
485                                 rq = intel_context_create_request(ce);
486                                 if (IS_ERR(rq)) {
487                                         struct drm_printer p =
488                                                 drm_info_printer(gt->i915->drm.dev);
489                                         intel_engine_dump(engine, &p,
490                                                           "%s(%s): failed to submit request\n",
491                                                           __func__,
492                                                           engine->name);
493
494                                         GEM_TRACE("%s(%s): failed to submit request\n",
495                                                   __func__,
496                                                   engine->name);
497                                         GEM_TRACE_DUMP();
498
499                                         intel_gt_set_wedged(gt);
500
501                                         err = PTR_ERR(rq);
502                                         break;
503                                 }
504
505                                 i915_request_add(rq);
506                         }
507                         err = intel_engine_reset(engine, NULL);
508                         if (err) {
509                                 pr_err("i915_reset_engine failed\n");
510                                 break;
511                         }
512
513                         if (i915_reset_count(global) != reset_count) {
514                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
515                                 err = -EINVAL;
516                                 break;
517                         }
518
519                         if (i915_reset_engine_count(global, engine) !=
520                             reset_engine_count + ++count) {
521                                 pr_err("%s engine reset not recorded!\n",
522                                        engine->name);
523                                 err = -EINVAL;
524                                 break;
525                         }
526                 } while (time_before(jiffies, end_time));
527                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
528                 st_engine_heartbeat_enable(engine);
529
530                 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
531
532                 intel_context_put(ce);
533                 if (igt_flush_test(gt->i915))
534                         err = -EIO;
535                 if (err)
536                         return err;
537         }
538
539         return 0;
540 }
541
542 static int __igt_reset_engine(struct intel_gt *gt, bool active)
543 {
544         struct i915_gpu_error *global = &gt->i915->gpu_error;
545         struct intel_engine_cs *engine;
546         enum intel_engine_id id;
547         struct hang h;
548         int err = 0;
549
550         /* Check that we can issue an engine reset on an idle (no-op) or actively hanging engine */
551
552         if (!intel_has_reset_engine(gt))
553                 return 0;
554
555         if (active) {
556                 err = hang_init(&h, gt);
557                 if (err)
558                         return err;
559         }
560
561         for_each_engine(engine, gt, id) {
562                 unsigned int reset_count, reset_engine_count;
563                 IGT_TIMEOUT(end_time);
564
565                 if (active && !intel_engine_can_store_dword(engine))
566                         continue;
567
568                 if (!wait_for_idle(engine)) {
569                         pr_err("%s failed to idle before reset\n",
570                                engine->name);
571                         err = -EIO;
572                         break;
573                 }
574
575                 reset_count = i915_reset_count(global);
576                 reset_engine_count = i915_reset_engine_count(global, engine);
577
578                 st_engine_heartbeat_disable(engine);
579                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
580                 do {
581                         if (active) {
582                                 struct i915_request *rq;
583
584                                 rq = hang_create_request(&h, engine);
585                                 if (IS_ERR(rq)) {
586                                         err = PTR_ERR(rq);
587                                         break;
588                                 }
589
590                                 i915_request_get(rq);
591                                 i915_request_add(rq);
592
593                                 if (!wait_until_running(&h, rq)) {
594                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
595
596                                         pr_err("%s: Failed to start request %llx, at %x\n",
597                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
598                                         intel_engine_dump(engine, &p,
599                                                           "%s\n", engine->name);
600
601                                         i915_request_put(rq);
602                                         err = -EIO;
603                                         break;
604                                 }
605
606                                 i915_request_put(rq);
607                         }
608
609                         err = intel_engine_reset(engine, NULL);
610                         if (err) {
611                                 pr_err("i915_reset_engine failed\n");
612                                 break;
613                         }
614
615                         if (i915_reset_count(global) != reset_count) {
616                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
617                                 err = -EINVAL;
618                                 break;
619                         }
620
621                         if (i915_reset_engine_count(global, engine) !=
622                             ++reset_engine_count) {
623                                 pr_err("%s engine reset not recorded!\n",
624                                        engine->name);
625                                 err = -EINVAL;
626                                 break;
627                         }
628                 } while (time_before(jiffies, end_time));
629                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
630                 st_engine_heartbeat_enable(engine);
631
632                 if (err)
633                         break;
634
635                 err = igt_flush_test(gt->i915);
636                 if (err)
637                         break;
638         }
639
640         if (intel_gt_is_wedged(gt))
641                 err = -EIO;
642
643         if (active)
644                 hang_fini(&h);
645
646         return err;
647 }
648
649 static int igt_reset_idle_engine(void *arg)
650 {
651         return __igt_reset_engine(arg, false);
652 }
653
654 static int igt_reset_active_engine(void *arg)
655 {
656         return __igt_reset_engine(arg, true);
657 }
658
659 struct active_engine {
660         struct task_struct *task;
661         struct intel_engine_cs *engine;
662         unsigned long resets;
663         unsigned int flags;
664 };
665
666 #define TEST_ACTIVE     BIT(0)
667 #define TEST_OTHERS     BIT(1)
668 #define TEST_SELF       BIT(2)
669 #define TEST_PRIORITY   BIT(3)
670
671 static int active_request_put(struct i915_request *rq)
672 {
673         int err = 0;
674
675         if (!rq)
676                 return 0;
677
678         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
679                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
680                           rq->engine->name,
681                           rq->fence.context,
682                           rq->fence.seqno);
683                 GEM_TRACE_DUMP();
684
685                 intel_gt_set_wedged(rq->engine->gt);
686                 err = -EIO;
687         }
688
689         i915_request_put(rq);
690
691         return err;
692 }
693
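/*
 * Body of the background kthreads started by __igt_reset_engines(): keep
 * the engine busy with a rolling window of requests spread over several
 * contexts, optionally at randomised priorities, while the parent thread
 * performs resets.
 */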
694 static int active_engine(void *data)
695 {
696         I915_RND_STATE(prng);
697         struct active_engine *arg = data;
698         struct intel_engine_cs *engine = arg->engine;
699         struct i915_request *rq[8] = {};
700         struct intel_context *ce[ARRAY_SIZE(rq)];
701         unsigned long count;
702         int err = 0;
703
704         for (count = 0; count < ARRAY_SIZE(ce); count++) {
705                 ce[count] = intel_context_create(engine);
706                 if (IS_ERR(ce[count])) {
707                         err = PTR_ERR(ce[count]);
708                         while (count--)
709                                 intel_context_put(ce[count]);
710                         return err;
711                 }
712         }
713
714         count = 0;
715         while (!kthread_should_stop()) {
716                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
717                 struct i915_request *old = rq[idx];
718                 struct i915_request *new;
719
720                 new = intel_context_create_request(ce[idx]);
721                 if (IS_ERR(new)) {
722                         err = PTR_ERR(new);
723                         break;
724                 }
725
726                 rq[idx] = i915_request_get(new);
727                 i915_request_add(new);
728
729                 if (engine->schedule && arg->flags & TEST_PRIORITY) {
730                         struct i915_sched_attr attr = {
731                                 .priority =
732                                         i915_prandom_u32_max_state(512, &prng),
733                         };
734                         engine->schedule(rq[idx], &attr);
735                 }
736
737                 err = active_request_put(old);
738                 if (err)
739                         break;
740
741                 cond_resched();
742         }
743
744         for (count = 0; count < ARRAY_SIZE(rq); count++) {
745                 int err__ = active_request_put(rq[count]);
746
747                 /* Keep the first error */
748                 if (!err)
749                         err = err__;
750
751                 intel_context_put(ce[count]);
752         }
753
754         return err;
755 }
756
757 static int __igt_reset_engines(struct intel_gt *gt,
758                                const char *test_name,
759                                unsigned int flags)
760 {
761         struct i915_gpu_error *global = &gt->i915->gpu_error;
762         struct intel_engine_cs *engine, *other;
763         enum intel_engine_id id, tmp;
764         struct hang h;
765         int err = 0;
766
767         /* Check that issuing a reset on one engine does not interfere
768          * with any other engine.
769          */
770
771         if (!intel_has_reset_engine(gt))
772                 return 0;
773
774         if (flags & TEST_ACTIVE) {
775                 err = hang_init(&h, gt);
776                 if (err)
777                         return err;
778
779                 if (flags & TEST_PRIORITY)
780                         h.ctx->sched.priority = 1024;
781         }
782
783         for_each_engine(engine, gt, id) {
784                 struct active_engine threads[I915_NUM_ENGINES] = {};
785                 unsigned long device = i915_reset_count(global);
786                 unsigned long count = 0, reported;
787                 IGT_TIMEOUT(end_time);
788
789                 if (flags & TEST_ACTIVE &&
790                     !intel_engine_can_store_dword(engine))
791                         continue;
792
793                 if (!wait_for_idle(engine)) {
794                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
795                                engine->name, test_name);
796                         err = -EIO;
797                         break;
798                 }
799
800                 memset(threads, 0, sizeof(threads));
801                 for_each_engine(other, gt, tmp) {
802                         struct task_struct *tsk;
803
804                         threads[tmp].resets =
805                                 i915_reset_engine_count(global, other);
806
807                         if (other == engine && !(flags & TEST_SELF))
808                                 continue;
809
810                         if (other != engine && !(flags & TEST_OTHERS))
811                                 continue;
812
813                         threads[tmp].engine = other;
814                         threads[tmp].flags = flags;
815
816                         tsk = kthread_run(active_engine, &threads[tmp],
817                                           "igt/%s", other->name);
818                         if (IS_ERR(tsk)) {
819                                 err = PTR_ERR(tsk);
820                                 goto unwind;
821                         }
822
823                         threads[tmp].task = tsk;
824                         get_task_struct(tsk);
825                 }
826
827                 yield(); /* start all threads before we begin */
828
829                 st_engine_heartbeat_disable(engine);
830                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
831                 do {
832                         struct i915_request *rq = NULL;
833
834                         if (flags & TEST_ACTIVE) {
835                                 rq = hang_create_request(&h, engine);
836                                 if (IS_ERR(rq)) {
837                                         err = PTR_ERR(rq);
838                                         break;
839                                 }
840
841                                 i915_request_get(rq);
842                                 i915_request_add(rq);
843
844                                 if (!wait_until_running(&h, rq)) {
845                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
846
847                                         pr_err("%s: Failed to start request %llx, at %x\n",
848                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
849                                         intel_engine_dump(engine, &p,
850                                                           "%s\n", engine->name);
851
852                                         i915_request_put(rq);
853                                         err = -EIO;
854                                         break;
855                                 }
856                         }
857
858                         err = intel_engine_reset(engine, NULL);
859                         if (err) {
860                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
861                                        engine->name, test_name, err);
862                                 break;
863                         }
864
865                         count++;
866
867                         if (rq) {
868                                 if (rq->fence.error != -EIO) {
869                                         pr_err("i915_reset_engine(%s:%s):"
870                                                " failed to reset request %llx:%lld\n",
871                                                engine->name, test_name,
872                                                rq->fence.context,
873                                                rq->fence.seqno);
874                                         i915_request_put(rq);
875
876                                         GEM_TRACE_DUMP();
877                                         intel_gt_set_wedged(gt);
878                                         err = -EIO;
879                                         break;
880                                 }
881
882                                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
883                                         struct drm_printer p =
884                                                 drm_info_printer(gt->i915->drm.dev);
885
886                                         pr_err("i915_reset_engine(%s:%s):"
887                                                " failed to complete request %llx:%lld after reset\n",
888                                                engine->name, test_name,
889                                                rq->fence.context,
890                                                rq->fence.seqno);
891                                         intel_engine_dump(engine, &p,
892                                                           "%s\n", engine->name);
893                                         i915_request_put(rq);
894
895                                         GEM_TRACE_DUMP();
896                                         intel_gt_set_wedged(gt);
897                                         err = -EIO;
898                                         break;
899                                 }
900
901                                 i915_request_put(rq);
902                         }
903
904                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
905                                 struct drm_printer p =
906                                         drm_info_printer(gt->i915->drm.dev);
907
908                                 pr_err("i915_reset_engine(%s:%s):"
909                                        " failed to idle after reset\n",
910                                        engine->name, test_name);
911                                 intel_engine_dump(engine, &p,
912                                                   "%s\n", engine->name);
913
914                                 err = -EIO;
915                                 break;
916                         }
917                 } while (time_before(jiffies, end_time));
918                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
919                 st_engine_heartbeat_enable(engine);
920
921                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
922                         engine->name, test_name, count);
923
924                 reported = i915_reset_engine_count(global, engine);
925                 reported -= threads[engine->id].resets;
926                 if (reported != count) {
927                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
928                                engine->name, test_name, count, reported);
929                         if (!err)
930                                 err = -EINVAL;
931                 }
932
933 unwind:
934                 for_each_engine(other, gt, tmp) {
935                         int ret;
936
937                         if (!threads[tmp].task)
938                                 continue;
939
940                         ret = kthread_stop(threads[tmp].task);
941                         if (ret) {
942                                 pr_err("kthread for other engine %s failed, err=%d\n",
943                                        other->name, ret);
944                                 if (!err)
945                                         err = ret;
946                         }
947                         put_task_struct(threads[tmp].task);
948
949                         if (other->uabi_class != engine->uabi_class &&
950                             threads[tmp].resets !=
951                             i915_reset_engine_count(global, other)) {
952                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
953                                        other->name,
954                                        i915_reset_engine_count(global, other) -
955                                        threads[tmp].resets);
956                                 if (!err)
957                                         err = -EINVAL;
958                         }
959                 }
960
961                 if (device != i915_reset_count(global)) {
962                         pr_err("Global reset (count=%ld)!\n",
963                                i915_reset_count(global) - device);
964                         if (!err)
965                                 err = -EINVAL;
966                 }
967
968                 if (err)
969                         break;
970
971                 err = igt_flush_test(gt->i915);
972                 if (err)
973                         break;
974         }
975
976         if (intel_gt_is_wedged(gt))
977                 err = -EIO;
978
979         if (flags & TEST_ACTIVE)
980                 hang_fini(&h);
981
982         return err;
983 }
984
985 static int igt_reset_engines(void *arg)
986 {
987         static const struct {
988                 const char *name;
989                 unsigned int flags;
990         } phases[] = {
991                 { "idle", 0 },
992                 { "active", TEST_ACTIVE },
993                 { "others-idle", TEST_OTHERS },
994                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
995                 {
996                         "others-priority",
997                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
998                 },
999                 {
1000                         "self-priority",
1001                         TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1002                 },
1003                 { }
1004         };
1005         struct intel_gt *gt = arg;
1006         typeof(*phases) *p;
1007         int err;
1008
1009         for (p = phases; p->name; p++) {
1010                 if (p->flags & TEST_PRIORITY) {
1011                         if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1012                                 continue;
1013                 }
1014
1015                 err = __igt_reset_engines(arg, p->name, p->flags);
1016                 if (err)
1017                         return err;
1018         }
1019
1020         return 0;
1021 }
1022
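/*
 * Stand-in for hangcheck: perform the reset directly and return the global
 * reset count sampled beforehand, so callers can check that it advanced.
 */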
1023 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1024 {
1025         u32 count = i915_reset_count(&gt->i915->gpu_error);
1026
1027         intel_gt_reset(gt, mask, NULL);
1028
1029         return count;
1030 }
1031
1032 static int igt_reset_wait(void *arg)
1033 {
1034         struct intel_gt *gt = arg;
1035         struct i915_gpu_error *global = &gt->i915->gpu_error;
1036         struct intel_engine_cs *engine = gt->engine[RCS0];
1037         struct i915_request *rq;
1038         unsigned int reset_count;
1039         struct hang h;
1040         long timeout;
1041         int err;
1042
1043         if (!engine || !intel_engine_can_store_dword(engine))
1044                 return 0;
1045
1046         /* Check that we detect a stuck waiter and issue a reset */
1047
1048         igt_global_reset_lock(gt);
1049
1050         err = hang_init(&h, gt);
1051         if (err)
1052                 goto unlock;
1053
1054         rq = hang_create_request(&h, engine);
1055         if (IS_ERR(rq)) {
1056                 err = PTR_ERR(rq);
1057                 goto fini;
1058         }
1059
1060         i915_request_get(rq);
1061         i915_request_add(rq);
1062
1063         if (!wait_until_running(&h, rq)) {
1064                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1065
1066                 pr_err("%s: Failed to start request %llx, at %x\n",
1067                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1068                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1069
1070                 intel_gt_set_wedged(gt);
1071
1072                 err = -EIO;
1073                 goto out_rq;
1074         }
1075
1076         reset_count = fake_hangcheck(gt, ALL_ENGINES);
1077
1078         timeout = i915_request_wait(rq, 0, 10);
1079         if (timeout < 0) {
1080                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1081                        timeout);
1082                 err = timeout;
1083                 goto out_rq;
1084         }
1085
1086         if (i915_reset_count(global) == reset_count) {
1087                 pr_err("No GPU reset recorded!\n");
1088                 err = -EINVAL;
1089                 goto out_rq;
1090         }
1091
1092 out_rq:
1093         i915_request_put(rq);
1094 fini:
1095         hang_fini(&h);
1096 unlock:
1097         igt_global_reset_unlock(gt);
1098
1099         if (intel_gt_is_wedged(gt))
1100                 return -EIO;
1101
1102         return err;
1103 }
1104
1105 struct evict_vma {
1106         struct completion completion;
1107         struct i915_vma *vma;
1108 };
1109
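/*
 * Kthread body: signal the parent that we have started, then try to evict
 * the target node, which is expected to block behind the hanging request
 * until the reset completes.
 */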
1110 static int evict_vma(void *data)
1111 {
1112         struct evict_vma *arg = data;
1113         struct i915_address_space *vm = arg->vma->vm;
1114         struct drm_mm_node evict = arg->vma->node;
1115         int err;
1116
1117         complete(&arg->completion);
1118
1119         mutex_lock(&vm->mutex);
1120         err = i915_gem_evict_for_node(vm, &evict, 0);
1121         mutex_unlock(&vm->mutex);
1122
1123         return err;
1124 }
1125
1126 static int evict_fence(void *data)
1127 {
1128         struct evict_vma *arg = data;
1129         int err;
1130
1131         complete(&arg->completion);
1132
1133         /* Mark the fence register as dirty to force the mmio update. */
1134         err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1135         if (err) {
1136                 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1137                 return err;
1138         }
1139
1140         err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1141         if (err) {
1142                 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1143                 return err;
1144         }
1145
1146         err = i915_vma_pin_fence(arg->vma);
1147         i915_vma_unpin(arg->vma);
1148         if (err) {
1149                 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1150                 return err;
1151         }
1152
1153         i915_vma_unpin_fence(arg->vma);
1154
1155         return 0;
1156 }
1157
1158 static int __igt_reset_evict_vma(struct intel_gt *gt,
1159                                  struct i915_address_space *vm,
1160                                  int (*fn)(void *),
1161                                  unsigned int flags)
1162 {
1163         struct intel_engine_cs *engine = gt->engine[RCS0];
1164         struct drm_i915_gem_object *obj;
1165         struct task_struct *tsk = NULL;
1166         struct i915_request *rq;
1167         struct evict_vma arg;
1168         struct hang h;
1169         unsigned int pin_flags;
1170         int err;
1171
1172         if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1173                 return 0;
1174
1175         if (!engine || !intel_engine_can_store_dword(engine))
1176                 return 0;
1177
1178         /* Check that we can recover an unbind stuck on a hanging request */
1179
1180         err = hang_init(&h, gt);
1181         if (err)
1182                 return err;
1183
1184         obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1185         if (IS_ERR(obj)) {
1186                 err = PTR_ERR(obj);
1187                 goto fini;
1188         }
1189
1190         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1191                 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1192                 if (err) {
1193                         pr_err("Invalid X-tiling settings; err:%d\n", err);
1194                         goto out_obj;
1195                 }
1196         }
1197
1198         arg.vma = i915_vma_instance(obj, vm, NULL);
1199         if (IS_ERR(arg.vma)) {
1200                 err = PTR_ERR(arg.vma);
1201                 goto out_obj;
1202         }
1203
1204         rq = hang_create_request(&h, engine);
1205         if (IS_ERR(rq)) {
1206                 err = PTR_ERR(rq);
1207                 goto out_obj;
1208         }
1209
1210         pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1211
1212         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1213                 pin_flags |= PIN_MAPPABLE;
1214
1215         err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1216         if (err) {
1217                 i915_request_add(rq);
1218                 goto out_obj;
1219         }
1220
1221         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1222                 err = i915_vma_pin_fence(arg.vma);
1223                 if (err) {
1224                         pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1225                         i915_vma_unpin(arg.vma);
1226                         i915_request_add(rq);
1227                         goto out_obj;
1228                 }
1229         }
1230
1231         i915_vma_lock(arg.vma);
1232         err = i915_request_await_object(rq, arg.vma->obj,
1233                                         flags & EXEC_OBJECT_WRITE);
1234         if (err == 0)
1235                 err = i915_vma_move_to_active(arg.vma, rq, flags);
1236         i915_vma_unlock(arg.vma);
1237
1238         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1239                 i915_vma_unpin_fence(arg.vma);
1240         i915_vma_unpin(arg.vma);
1241
1242         i915_request_get(rq);
1243         i915_request_add(rq);
1244         if (err)
1245                 goto out_rq;
1246
1247         if (!wait_until_running(&h, rq)) {
1248                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1249
1250                 pr_err("%s: Failed to start request %llx, at %x\n",
1251                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1252                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1253
1254                 intel_gt_set_wedged(gt);
1255                 goto out_reset;
1256         }
1257
1258         init_completion(&arg.completion);
1259
1260         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1261         if (IS_ERR(tsk)) {
1262                 err = PTR_ERR(tsk);
1263                 tsk = NULL;
1264                 goto out_reset;
1265         }
1266         get_task_struct(tsk);
1267
1268         wait_for_completion(&arg.completion);
1269
1270         if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1271                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1272
1273                 pr_err("igt/evict_vma kthread did not wait\n");
1274                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1275
1276                 intel_gt_set_wedged(gt);
1277                 goto out_reset;
1278         }
1279
1280 out_reset:
1281         igt_global_reset_lock(gt);
1282         fake_hangcheck(gt, rq->engine->mask);
1283         igt_global_reset_unlock(gt);
1284
1285         if (tsk) {
1286                 struct intel_wedge_me w;
1287
1288                 /* The reset, even indirectly, should take less than 10ms. */
1289                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1290                         err = kthread_stop(tsk);
1291
1292                 put_task_struct(tsk);
1293         }
1294
1295 out_rq:
1296         i915_request_put(rq);
1297 out_obj:
1298         i915_gem_object_put(obj);
1299 fini:
1300         hang_fini(&h);
1301         if (intel_gt_is_wedged(gt))
1302                 return -EIO;
1303
1304         return err;
1305 }
1306
1307 static int igt_reset_evict_ggtt(void *arg)
1308 {
1309         struct intel_gt *gt = arg;
1310
1311         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1312                                      evict_vma, EXEC_OBJECT_WRITE);
1313 }
1314
1315 static int igt_reset_evict_ppgtt(void *arg)
1316 {
1317         struct intel_gt *gt = arg;
1318         struct i915_ppgtt *ppgtt;
1319         int err;
1320
1321         /* aliasing == global gtt locking, covered above */
1322         if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1323                 return 0;
1324
1325         ppgtt = i915_ppgtt_create(gt);
1326         if (IS_ERR(ppgtt))
1327                 return PTR_ERR(ppgtt);
1328
1329         err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1330                                     evict_vma, EXEC_OBJECT_WRITE);
1331         i915_vm_put(&ppgtt->vm);
1332
1333         return err;
1334 }
1335
1336 static int igt_reset_evict_fence(void *arg)
1337 {
1338         struct intel_gt *gt = arg;
1339
1340         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1341                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1342 }
1343
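/* Wait for every engine other than @exclude to idle, or return -EIO. */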
1344 static int wait_for_others(struct intel_gt *gt,
1345                            struct intel_engine_cs *exclude)
1346 {
1347         struct intel_engine_cs *engine;
1348         enum intel_engine_id id;
1349
1350         for_each_engine(engine, gt, id) {
1351                 if (engine == exclude)
1352                         continue;
1353
1354                 if (!wait_for_idle(engine))
1355                         return -EIO;
1356         }
1357
1358         return 0;
1359 }
1360
1361 static int igt_reset_queue(void *arg)
1362 {
1363         struct intel_gt *gt = arg;
1364         struct i915_gpu_error *global = &gt->i915->gpu_error;
1365         struct intel_engine_cs *engine;
1366         enum intel_engine_id id;
1367         struct hang h;
1368         int err;
1369
1370         /* Check that we replay pending requests following a hang */
1371
1372         igt_global_reset_lock(gt);
1373
1374         err = hang_init(&h, gt);
1375         if (err)
1376                 goto unlock;
1377
1378         for_each_engine(engine, gt, id) {
1379                 struct i915_request *prev;
1380                 IGT_TIMEOUT(end_time);
1381                 unsigned int count;
1382
1383                 if (!intel_engine_can_store_dword(engine))
1384                         continue;
1385
1386                 prev = hang_create_request(&h, engine);
1387                 if (IS_ERR(prev)) {
1388                         err = PTR_ERR(prev);
1389                         goto fini;
1390                 }
1391
1392                 i915_request_get(prev);
1393                 i915_request_add(prev);
1394
1395                 count = 0;
1396                 do {
1397                         struct i915_request *rq;
1398                         unsigned int reset_count;
1399
1400                         rq = hang_create_request(&h, engine);
1401                         if (IS_ERR(rq)) {
1402                                 err = PTR_ERR(rq);
1403                                 goto fini;
1404                         }
1405
1406                         i915_request_get(rq);
1407                         i915_request_add(rq);
1408
1409                         /*
1410                          * XXX We don't handle resetting the kernel context
1411                          * very well. If we trigger a device reset twice in
1412                          * quick succession while the kernel context is
1413                          * executing, we may end up skipping the breadcrumb.
1414                          * This is really only a problem for the selftest as
1415                          * normally there is a large interlude between resets
1416                          * (hangcheck), or we focus on resetting just one
1417                          * engine and so avoid repeatedly resetting innocents.
1418                          */
1419                         err = wait_for_others(gt, engine);
1420                         if (err) {
1421                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1422                                        __func__, engine->name);
1423                                 i915_request_put(rq);
1424                                 i915_request_put(prev);
1425
1426                                 GEM_TRACE_DUMP();
1427                                 intel_gt_set_wedged(gt);
1428                                 goto fini;
1429                         }
1430
1431                         if (!wait_until_running(&h, prev)) {
1432                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1433
1434                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1435                                        __func__, engine->name,
1436                                        prev->fence.seqno, hws_seqno(&h, prev));
1437                                 intel_engine_dump(engine, &p,
1438                                                   "%s\n", engine->name);
1439
1440                                 i915_request_put(rq);
1441                                 i915_request_put(prev);
1442
1443                                 intel_gt_set_wedged(gt);
1444
1445                                 err = -EIO;
1446                                 goto fini;
1447                         }
1448
1449                         reset_count = fake_hangcheck(gt, BIT(id));
1450
1451                         if (prev->fence.error != -EIO) {
1452                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1453                                        prev->fence.error);
1454                                 i915_request_put(rq);
1455                                 i915_request_put(prev);
1456                                 err = -EINVAL;
1457                                 goto fini;
1458                         }
1459
1460                         if (rq->fence.error) {
1461                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1462                                        rq->fence.error);
1463                                 i915_request_put(rq);
1464                                 i915_request_put(prev);
1465                                 err = -EINVAL;
1466                                 goto fini;
1467                         }
1468
1469                         if (i915_reset_count(global) == reset_count) {
1470                                 pr_err("No GPU reset recorded!\n");
1471                                 i915_request_put(rq);
1472                                 i915_request_put(prev);
1473                                 err = -EINVAL;
1474                                 goto fini;
1475                         }
1476
1477                         i915_request_put(prev);
1478                         prev = rq;
1479                         count++;
1480                 } while (time_before(jiffies, end_time));
1481                 pr_info("%s: Completed %d resets\n", engine->name, count);
1482
1483                 *h.batch = MI_BATCH_BUFFER_END;
1484                 intel_gt_chipset_flush(engine->gt);
1485
1486                 i915_request_put(prev);
1487
1488                 err = igt_flush_test(gt->i915);
1489                 if (err)
1490                         break;
1491         }
1492
1493 fini:
1494         hang_fini(&h);
1495 unlock:
1496         igt_global_reset_unlock(gt);
1497
1498         if (intel_gt_is_wedged(gt))
1499                 return -EIO;
1500
1501         return err;
1502 }
1503
1504 static int igt_handle_error(void *arg)
1505 {
1506         struct intel_gt *gt = arg;
1507         struct i915_gpu_error *global = &gt->i915->gpu_error;
1508         struct intel_engine_cs *engine = gt->engine[RCS0];
1509         struct hang h;
1510         struct i915_request *rq;
1511         struct i915_gpu_coredump *error;
1512         int err;
1513
1514         /* Check that we can issue a global GPU and engine reset */
1515
1516         if (!intel_has_reset_engine(gt))
1517                 return 0;
1518
1519         if (!engine || !intel_engine_can_store_dword(engine))
1520                 return 0;
1521
1522         err = hang_init(&h, gt);
1523         if (err)
1524                 return err;
1525
1526         rq = hang_create_request(&h, engine);
1527         if (IS_ERR(rq)) {
1528                 err = PTR_ERR(rq);
1529                 goto err_fini;
1530         }
1531
1532         i915_request_get(rq);
1533         i915_request_add(rq);
1534
1535         if (!wait_until_running(&h, rq)) {
1536                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1537
1538                 pr_err("%s: Failed to start request %llx, at %x\n",
1539                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1540                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1541
1542                 intel_gt_set_wedged(gt);
1543
1544                 err = -EIO;
1545                 goto err_request;
1546         }
1547
1548         /* Temporarily disable error capture */
1549         error = xchg(&global->first_error, (void *)-1);
1550
1551         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1552
1553         xchg(&global->first_error, error);
1554
1555         if (rq->fence.error != -EIO) {
1556                 pr_err("Guilty request not identified!\n");
1557                 err = -EINVAL;
1558                 goto err_request;
1559         }
1560
1561 err_request:
1562         i915_request_put(rq);
1563 err_fini:
1564         hang_fini(&h);
1565         return err;
1566 }
1567
1568 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1569                                      const struct igt_atomic_section *p,
1570                                      const char *mode)
1571 {
1572         struct tasklet_struct * const t = &engine->execlists.tasklet;
1573         int err;
1574
1575         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1576                   engine->name, mode, p->name);
1577
1578         tasklet_disable(t);
1579         p->critical_section_begin();
1580
1581         err = intel_engine_reset(engine, NULL);
1582
1583         p->critical_section_end();
1584         tasklet_enable(t);
1585
1586         if (err)
1587                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1588                        engine->name, mode, p->name);
1589
1590         return err;
1591 }
1592
1593 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1594                                    const struct igt_atomic_section *p)
1595 {
1596         struct i915_request *rq;
1597         struct hang h;
1598         int err;
1599
1600         err = __igt_atomic_reset_engine(engine, p, "idle");
1601         if (err)
1602                 return err;
1603
1604         err = hang_init(&h, engine->gt);
1605         if (err)
1606                 return err;
1607
1608         rq = hang_create_request(&h, engine);
1609         if (IS_ERR(rq)) {
1610                 err = PTR_ERR(rq);
1611                 goto out;
1612         }
1613
1614         i915_request_get(rq);
1615         i915_request_add(rq);
1616
1617         if (wait_until_running(&h, rq)) {
1618                 err = __igt_atomic_reset_engine(engine, p, "active");
1619         } else {
1620                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1621                        __func__, engine->name,
1622                        rq->fence.seqno, hws_seqno(&h, rq));
1623                 intel_gt_set_wedged(engine->gt);
1624                 err = -EIO;
1625         }
1626
1627         if (err == 0) {
1628                 struct intel_wedge_me w;
1629
1630                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1631                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1632                 if (intel_gt_is_wedged(engine->gt))
1633                         err = -EIO;
1634         }
1635
1636         i915_request_put(rq);
1637 out:
1638         hang_fini(&h);
1639         return err;
1640 }
1641
1642 static int igt_reset_engines_atomic(void *arg)
1643 {
1644         struct intel_gt *gt = arg;
1645         const typeof(*igt_atomic_phases) *p;
1646         int err = 0;
1647
1648         /* Check that engine resets are usable from atomic context */
1649
1650         if (!intel_has_reset_engine(gt))
1651                 return 0;
1652
1653         if (intel_uc_uses_guc_submission(&gt->uc))
1654                 return 0;
1655
1656         igt_global_reset_lock(gt);
1657
1658         /* Flush any requests before we get started and check basics */
1659         if (!igt_force_reset(gt))
1660                 goto unlock;
1661
1662         for (p = igt_atomic_phases; p->name; p++) {
1663                 struct intel_engine_cs *engine;
1664                 enum intel_engine_id id;
1665
1666                 for_each_engine(engine, gt, id) {
1667                         err = igt_atomic_reset_engine(engine, p);
1668                         if (err)
1669                                 goto out;
1670                 }
1671         }
1672
1673 out:
1674         /* As we poke around the guts, do a full reset before continuing. */
1675         igt_force_reset(gt);
1676 unlock:
1677         igt_global_reset_unlock(gt);
1678
1679         return err;
1680 }
1681
1682 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1683 {
1684         static const struct i915_subtest tests[] = {
1685                 SUBTEST(igt_hang_sanitycheck),
1686                 SUBTEST(igt_reset_nop),
1687                 SUBTEST(igt_reset_nop_engine),
1688                 SUBTEST(igt_reset_idle_engine),
1689                 SUBTEST(igt_reset_active_engine),
1690                 SUBTEST(igt_reset_engines),
1691                 SUBTEST(igt_reset_engines_atomic),
1692                 SUBTEST(igt_reset_queue),
1693                 SUBTEST(igt_reset_wait),
1694                 SUBTEST(igt_reset_evict_ggtt),
1695                 SUBTEST(igt_reset_evict_ppgtt),
1696                 SUBTEST(igt_reset_evict_fence),
1697                 SUBTEST(igt_handle_error),
1698         };
1699         struct intel_gt *gt = &i915->gt;
1700         intel_wakeref_t wakeref;
1701         int err;
1702
1703         if (!intel_has_gpu_reset(gt))
1704                 return 0;
1705
1706         if (intel_gt_is_wedged(gt))
1707                 return -EIO; /* we're long past hope of a successful reset */
1708
1709         wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1710
1711         err = intel_gt_live_subtests(tests, gt);
1712
1713         intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1714
1715         return err;
1716 }