GNU Linux-libre 4.19.207-gnu1
drivers/gpu/drm/i915/selftests/intel_hangcheck.c
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include <linux/kthread.h>
26
27 #include "../i915_selftest.h"
28 #include "i915_random.h"
29 #include "igt_flush_test.h"
30 #include "igt_wedge_me.h"
31
32 #include "mock_context.h"
33 #include "mock_drm.h"
34
35 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
36
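/*
 * Fixture shared by the hang tests: the device, a private kernel context,
 * one page used as a makeshift status page ("hws", CPU view in "seqno")
 * and one page holding the spinning batch ("obj", CPU view in "batch").
 */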
37 struct hang {
38         struct drm_i915_private *i915;
39         struct drm_i915_gem_object *hws;
40         struct drm_i915_gem_object *obj;
41         struct i915_gem_context *ctx;
42         u32 *seqno;
43         u32 *batch;
44 };
45
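/*
 * hang_init() builds the fixture above: a fresh kernel context plus two
 * one-page internal objects.  The status page is kept cacheable and mapped
 * write-back so the CPU can cheaply poll the seqnos written by the GPU;
 * the batch page is mapped WB on LLC platforms and WC otherwise, ready for
 * emit_recurse_batch() to poke commands into it.
 */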
46 static int hang_init(struct hang *h, struct drm_i915_private *i915)
47 {
48         void *vaddr;
49         int err;
50
51         memset(h, 0, sizeof(*h));
52         h->i915 = i915;
53
54         h->ctx = kernel_context(i915);
55         if (IS_ERR(h->ctx))
56                 return PTR_ERR(h->ctx);
57
58         h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
59         if (IS_ERR(h->hws)) {
60                 err = PTR_ERR(h->hws);
61                 goto err_ctx;
62         }
63
64         h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
65         if (IS_ERR(h->obj)) {
66                 err = PTR_ERR(h->obj);
67                 goto err_hws;
68         }
69
70         i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
71         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
72         if (IS_ERR(vaddr)) {
73                 err = PTR_ERR(vaddr);
74                 goto err_obj;
75         }
76         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
77
78         vaddr = i915_gem_object_pin_map(h->obj,
79                                         HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
80         if (IS_ERR(vaddr)) {
81                 err = PTR_ERR(vaddr);
82                 goto err_unpin_hws;
83         }
84         h->batch = vaddr;
85
86         return 0;
87
88 err_unpin_hws:
89         i915_gem_object_unpin_map(h->hws);
90 err_obj:
91         i915_gem_object_put(h->obj);
92 err_hws:
93         i915_gem_object_put(h->hws);
94 err_ctx:
95         kernel_context_close(h->ctx);
96         return err;
97 }
98
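/*
 * Each fence context gets its own dword slot within the status page
 * (indexed by rq->fence.context), so hangs submitted from different
 * contexts record their seqnos without clobbering one another.
 */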
99 static u64 hws_address(const struct i915_vma *hws,
100                        const struct i915_request *rq)
101 {
102         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
103 }
104
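/*
 * Emit a batch that stores the request's seqno into its slot of the status
 * page and then branches back to its own start, spinning until either the
 * batch is rewritten with MI_BATCH_BUFFER_END or the engine is reset.  The
 * MI_ARB_CHECKs allow arbitration while spinning, and the store/branch
 * encoding is chosen per gen (gen5 and earlier also use a secure dispatch).
 */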
105 static int emit_recurse_batch(struct hang *h,
106                               struct i915_request *rq)
107 {
108         struct drm_i915_private *i915 = h->i915;
109         struct i915_address_space *vm =
110                 rq->gem_context->ppgtt ?
111                 &rq->gem_context->ppgtt->vm :
112                 &i915->ggtt.vm;
113         struct i915_vma *hws, *vma;
114         unsigned int flags;
115         u32 *batch;
116         int err;
117
118         vma = i915_vma_instance(h->obj, vm, NULL);
119         if (IS_ERR(vma))
120                 return PTR_ERR(vma);
121
122         hws = i915_vma_instance(h->hws, vm, NULL);
123         if (IS_ERR(hws))
124                 return PTR_ERR(hws);
125
126         err = i915_vma_pin(vma, 0, 0, PIN_USER);
127         if (err)
128                 return err;
129
130         err = i915_vma_pin(hws, 0, 0, PIN_USER);
131         if (err)
132                 goto unpin_vma;
133
134         err = i915_vma_move_to_active(vma, rq, 0);
135         if (err)
136                 goto unpin_hws;
137
138         if (!i915_gem_object_has_active_reference(vma->obj)) {
139                 i915_gem_object_get(vma->obj);
140                 i915_gem_object_set_active_reference(vma->obj);
141         }
142
143         err = i915_vma_move_to_active(hws, rq, 0);
144         if (err)
145                 goto unpin_hws;
146
147         if (!i915_gem_object_has_active_reference(hws->obj)) {
148                 i915_gem_object_get(hws->obj);
149                 i915_gem_object_set_active_reference(hws->obj);
150         }
151
152         batch = h->batch;
153         if (INTEL_GEN(i915) >= 8) {
154                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
155                 *batch++ = lower_32_bits(hws_address(hws, rq));
156                 *batch++ = upper_32_bits(hws_address(hws, rq));
157                 *batch++ = rq->fence.seqno;
158                 *batch++ = MI_ARB_CHECK;
159
160                 memset(batch, 0, 1024);
161                 batch += 1024 / sizeof(*batch);
162
163                 *batch++ = MI_ARB_CHECK;
164                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
165                 *batch++ = lower_32_bits(vma->node.start);
166                 *batch++ = upper_32_bits(vma->node.start);
167         } else if (INTEL_GEN(i915) >= 6) {
168                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
169                 *batch++ = 0;
170                 *batch++ = lower_32_bits(hws_address(hws, rq));
171                 *batch++ = rq->fence.seqno;
172                 *batch++ = MI_ARB_CHECK;
173
174                 memset(batch, 0, 1024);
175                 batch += 1024 / sizeof(*batch);
176
177                 *batch++ = MI_ARB_CHECK;
178                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
179                 *batch++ = lower_32_bits(vma->node.start);
180         } else if (INTEL_GEN(i915) >= 4) {
181                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
182                 *batch++ = 0;
183                 *batch++ = lower_32_bits(hws_address(hws, rq));
184                 *batch++ = rq->fence.seqno;
185                 *batch++ = MI_ARB_CHECK;
186
187                 memset(batch, 0, 1024);
188                 batch += 1024 / sizeof(*batch);
189
190                 *batch++ = MI_ARB_CHECK;
191                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
192                 *batch++ = lower_32_bits(vma->node.start);
193         } else {
194                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
195                 *batch++ = lower_32_bits(hws_address(hws, rq));
196                 *batch++ = rq->fence.seqno;
197                 *batch++ = MI_ARB_CHECK;
198
199                 memset(batch, 0, 1024);
200                 batch += 1024 / sizeof(*batch);
201
202                 *batch++ = MI_ARB_CHECK;
203                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
204                 *batch++ = lower_32_bits(vma->node.start);
205         }
206         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
207         i915_gem_chipset_flush(h->i915);
208
209         flags = 0;
210         if (INTEL_GEN(vm->i915) <= 5)
211                 flags |= I915_DISPATCH_SECURE;
212
213         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
214
215 unpin_hws:
216         i915_vma_unpin(hws);
217 unpin_vma:
218         i915_vma_unpin(vma);
219         return err;
220 }
221
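/*
 * Allocate a request on @engine using the fixture's context and fill it
 * with the recursing batch.  If the previous batch object is still active
 * (the GPU may still be spinning on it), a fresh internal object is swapped
 * in first so that an in-flight batch is never rewritten from under the HW.
 */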
222 static struct i915_request *
223 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
224 {
225         struct i915_request *rq;
226         int err;
227
228         if (i915_gem_object_is_active(h->obj)) {
229                 struct drm_i915_gem_object *obj;
230                 void *vaddr;
231
232                 obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
233                 if (IS_ERR(obj))
234                         return ERR_CAST(obj);
235
236                 vaddr = i915_gem_object_pin_map(obj,
237                                                 HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
238                 if (IS_ERR(vaddr)) {
239                         i915_gem_object_put(obj);
240                         return ERR_CAST(vaddr);
241                 }
242
243                 i915_gem_object_unpin_map(h->obj);
244                 i915_gem_object_put(h->obj);
245
246                 h->obj = obj;
247                 h->batch = vaddr;
248         }
249
250         rq = i915_request_alloc(engine, h->ctx);
251         if (IS_ERR(rq))
252                 return rq;
253
254         err = emit_recurse_batch(h, rq);
255         if (err) {
256                 i915_request_add(rq);
257                 return ERR_PTR(err);
258         }
259
260         return rq;
261 }
262
263 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
264 {
265         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
266 }
267
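/*
 * Terminate any still-spinning batch by rewriting it with
 * MI_BATCH_BUFFER_END, then release the fixture's objects and context and
 * flush the remaining work.
 */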
268 static void hang_fini(struct hang *h)
269 {
270         *h->batch = MI_BATCH_BUFFER_END;
271         i915_gem_chipset_flush(h->i915);
272
273         i915_gem_object_unpin_map(h->obj);
274         i915_gem_object_put(h->obj);
275
276         i915_gem_object_unpin_map(h->hws);
277         i915_gem_object_put(h->hws);
278
279         kernel_context_close(h->ctx);
280
281         igt_flush_test(h->i915, I915_WAIT_LOCKED);
282 }
283
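/*
 * Wait for the hanging batch to start executing: a short 10us busy-wait
 * followed by up to a second of sleeping waits for the seqno store to land
 * in the status page.  Returns true once the request is running on the HW.
 */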
284 static bool wait_until_running(struct hang *h, struct i915_request *rq)
285 {
286         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
287                                                rq->fence.seqno),
288                              10) &&
289                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
290                                             rq->fence.seqno),
291                           1000));
292 }
293
294 static int igt_hang_sanitycheck(void *arg)
295 {
296         struct drm_i915_private *i915 = arg;
297         struct i915_request *rq;
298         struct intel_engine_cs *engine;
299         enum intel_engine_id id;
300         struct hang h;
301         int err;
302
303         /* Basic check that we can execute our hanging batch */
304
305         mutex_lock(&i915->drm.struct_mutex);
306         err = hang_init(&h, i915);
307         if (err)
308                 goto unlock;
309
310         for_each_engine(engine, i915, id) {
311                 long timeout;
312
313                 if (!intel_engine_can_store_dword(engine))
314                         continue;
315
316                 rq = hang_create_request(&h, engine);
317                 if (IS_ERR(rq)) {
318                         err = PTR_ERR(rq);
319                         pr_err("Failed to create request for %s, err=%d\n",
320                                engine->name, err);
321                         goto fini;
322                 }
323
324                 i915_request_get(rq);
325
326                 *h.batch = MI_BATCH_BUFFER_END;
327                 i915_gem_chipset_flush(i915);
328
329                 i915_request_add(rq);
330
331                 timeout = i915_request_wait(rq,
332                                             I915_WAIT_LOCKED,
333                                             MAX_SCHEDULE_TIMEOUT);
334                 i915_request_put(rq);
335
336                 if (timeout < 0) {
337                         err = timeout;
338                         pr_err("Wait for request failed on %s, err=%d\n",
339                                engine->name, err);
340                         goto fini;
341                 }
342         }
343
344 fini:
345         hang_fini(&h);
346 unlock:
347         mutex_unlock(&i915->drm.struct_mutex);
348         return err;
349 }
350
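/*
 * Take exclusive ownership of the reset machinery: grab I915_RESET_BACKOFF
 * and every per-engine reset bit, waiting out any reset already in
 * progress, so that nothing else can trigger a reset while a test runs.
 */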
351 static void global_reset_lock(struct drm_i915_private *i915)
352 {
353         struct intel_engine_cs *engine;
354         enum intel_engine_id id;
355
356         pr_debug("%s: current gpu_error=%08lx\n",
357                  __func__, i915->gpu_error.flags);
358
359         while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
360                 wait_event(i915->gpu_error.reset_queue,
361                            !test_bit(I915_RESET_BACKOFF,
362                                      &i915->gpu_error.flags));
363
364         for_each_engine(engine, i915, id) {
365                 while (test_and_set_bit(I915_RESET_ENGINE + id,
366                                         &i915->gpu_error.flags))
367                         wait_on_bit(&i915->gpu_error.flags,
368                                     I915_RESET_ENGINE + id,
369                                     TASK_UNINTERRUPTIBLE);
370         }
371 }
372
373 static void global_reset_unlock(struct drm_i915_private *i915)
374 {
375         struct intel_engine_cs *engine;
376         enum intel_engine_id id;
377
378         for_each_engine(engine, i915, id)
379                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
380
381         clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
382         wake_up_all(&i915->gpu_error.reset_queue);
383 }
384
385 static int igt_global_reset(void *arg)
386 {
387         struct drm_i915_private *i915 = arg;
388         unsigned int reset_count;
389         int err = 0;
390
391         /* Check that we can issue a global GPU reset */
392
393         global_reset_lock(i915);
394         set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
395
396         mutex_lock(&i915->drm.struct_mutex);
397         reset_count = i915_reset_count(&i915->gpu_error);
398
399         i915_reset(i915, ALL_ENGINES, NULL);
400
401         if (i915_reset_count(&i915->gpu_error) == reset_count) {
402                 pr_err("No GPU reset recorded!\n");
403                 err = -EINVAL;
404         }
405         mutex_unlock(&i915->drm.struct_mutex);
406
407         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
408         global_reset_unlock(i915);
409
410         if (i915_terminally_wedged(&i915->gpu_error))
411                 err = -EIO;
412
413         return err;
414 }
415
416 static bool wait_for_idle(struct intel_engine_cs *engine)
417 {
418         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
419 }
420
421 static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
422 {
423         struct intel_engine_cs *engine;
424         enum intel_engine_id id;
425         struct hang h;
426         int err = 0;
427
429         /* Check that we can issue an engine reset on an idle (no-op) or active engine */
429
430         if (!intel_has_reset_engine(i915))
431                 return 0;
432
433         if (active) {
434                 mutex_lock(&i915->drm.struct_mutex);
435                 err = hang_init(&h, i915);
436                 mutex_unlock(&i915->drm.struct_mutex);
437                 if (err)
438                         return err;
439         }
440
441         for_each_engine(engine, i915, id) {
442                 unsigned int reset_count, reset_engine_count;
443                 IGT_TIMEOUT(end_time);
444
445                 if (active && !intel_engine_can_store_dword(engine))
446                         continue;
447
448                 if (!wait_for_idle(engine)) {
449                         pr_err("%s failed to idle before reset\n",
450                                engine->name);
451                         err = -EIO;
452                         break;
453                 }
454
455                 reset_count = i915_reset_count(&i915->gpu_error);
456                 reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
457                                                              engine);
458
459                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
460                 do {
461                         u32 seqno = intel_engine_get_seqno(engine);
462
463                         if (active) {
464                                 struct i915_request *rq;
465
466                                 mutex_lock(&i915->drm.struct_mutex);
467                                 rq = hang_create_request(&h, engine);
468                                 if (IS_ERR(rq)) {
469                                         err = PTR_ERR(rq);
470                                         mutex_unlock(&i915->drm.struct_mutex);
471                                         break;
472                                 }
473
474                                 i915_request_get(rq);
475                                 i915_request_add(rq);
476                                 mutex_unlock(&i915->drm.struct_mutex);
477
478                                 if (!wait_until_running(&h, rq)) {
479                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
480
481                                         pr_err("%s: Failed to start request %x, at %x\n",
482                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
483                                         intel_engine_dump(engine, &p,
484                                                           "%s\n", engine->name);
485
486                                         i915_request_put(rq);
487                                         err = -EIO;
488                                         break;
489                                 }
490
491                                 GEM_BUG_ON(!rq->global_seqno);
492                                 seqno = rq->global_seqno - 1;
493                                 i915_request_put(rq);
494                         }
495
496                         err = i915_reset_engine(engine, NULL);
497                         if (err) {
498                                 pr_err("i915_reset_engine failed\n");
499                                 break;
500                         }
501
502                         if (i915_reset_count(&i915->gpu_error) != reset_count) {
503                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
504                                 err = -EINVAL;
505                                 break;
506                         }
507
508                         reset_engine_count += active;
509                         if (i915_reset_engine_count(&i915->gpu_error, engine) !=
510                             reset_engine_count) {
511                                 pr_err("%s engine reset %srecorded!\n",
512                                        engine->name, active ? "not " : "");
513                                 err = -EINVAL;
514                                 break;
515                         }
516
517                         if (!wait_for_idle(engine)) {
518                                 struct drm_printer p =
519                                         drm_info_printer(i915->drm.dev);
520
521                                 pr_err("%s failed to idle after reset\n",
522                                        engine->name);
523                                 intel_engine_dump(engine, &p,
524                                                   "%s\n", engine->name);
525
526                                 err = -EIO;
527                                 break;
528                         }
529                 } while (time_before(jiffies, end_time));
530                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
531
532                 if (err)
533                         break;
534
535                 err = igt_flush_test(i915, 0);
536                 if (err)
537                         break;
538         }
539
540         if (i915_terminally_wedged(&i915->gpu_error))
541                 err = -EIO;
542
543         if (active) {
544                 mutex_lock(&i915->drm.struct_mutex);
545                 hang_fini(&h);
546                 mutex_unlock(&i915->drm.struct_mutex);
547         }
548
549         return err;
550 }
551
552 static int igt_reset_idle_engine(void *arg)
553 {
554         return __igt_reset_engine(arg, false);
555 }
556
557 static int igt_reset_active_engine(void *arg)
558 {
559         return __igt_reset_engine(arg, true);
560 }
561
562 struct active_engine {
563         struct task_struct *task;
564         struct intel_engine_cs *engine;
565         unsigned long resets;
566         unsigned int flags;
567 };
568
569 #define TEST_ACTIVE     BIT(0)
570 #define TEST_OTHERS     BIT(1)
571 #define TEST_SELF       BIT(2)
572 #define TEST_PRIORITY   BIT(3)
573
574 static int active_request_put(struct i915_request *rq)
575 {
576         int err = 0;
577
578         if (!rq)
579                 return 0;
580
581         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
582                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
583                           rq->engine->name,
584                           rq->fence.context,
585                           rq->fence.seqno,
586                           i915_request_global_seqno(rq));
587                 GEM_TRACE_DUMP();
588
589                 i915_gem_set_wedged(rq->i915);
590                 err = -EIO;
591         }
592
593         i915_request_put(rq);
594
595         return err;
596 }
597
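/*
 * Body of the background kthreads spawned by __igt_reset_engines(): keep up
 * to 8 requests in flight on the given engine, each on its own context
 * (optionally with a randomised priority), until kthread_stop() is called.
 * A request that fails to complete within 5s wedges the device.
 */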
598 static int active_engine(void *data)
599 {
600         I915_RND_STATE(prng);
601         struct active_engine *arg = data;
602         struct intel_engine_cs *engine = arg->engine;
603         struct i915_request *rq[8] = {};
604         struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
605         struct drm_file *file;
606         unsigned long count = 0;
607         int err = 0;
608
609         file = mock_file(engine->i915);
610         if (IS_ERR(file))
611                 return PTR_ERR(file);
612
613         for (count = 0; count < ARRAY_SIZE(ctx); count++) {
614                 mutex_lock(&engine->i915->drm.struct_mutex);
615                 ctx[count] = live_context(engine->i915, file);
616                 mutex_unlock(&engine->i915->drm.struct_mutex);
617                 if (IS_ERR(ctx[count])) {
618                         err = PTR_ERR(ctx[count]);
619                         while (--count)
620                                 i915_gem_context_put(ctx[count]);
621                         goto err_file;
622                 }
623         }
624
625         while (!kthread_should_stop()) {
626                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
627                 struct i915_request *old = rq[idx];
628                 struct i915_request *new;
629
630                 mutex_lock(&engine->i915->drm.struct_mutex);
631                 new = i915_request_alloc(engine, ctx[idx]);
632                 if (IS_ERR(new)) {
633                         mutex_unlock(&engine->i915->drm.struct_mutex);
634                         err = PTR_ERR(new);
635                         break;
636                 }
637
638                 if (arg->flags & TEST_PRIORITY)
639                         ctx[idx]->sched.priority =
640                                 i915_prandom_u32_max_state(512, &prng);
641
642                 rq[idx] = i915_request_get(new);
643                 i915_request_add(new);
644                 mutex_unlock(&engine->i915->drm.struct_mutex);
645
646                 err = active_request_put(old);
647                 if (err)
648                         break;
649
650                 cond_resched();
651         }
652
653         for (count = 0; count < ARRAY_SIZE(rq); count++) {
654                 int err__ = active_request_put(rq[count]);
655
656                 /* Keep the first error */
657                 if (!err)
658                         err = err__;
659         }
660
661 err_file:
662         mock_file_free(engine->i915, file);
663         return err;
664 }
665
666 static int __igt_reset_engines(struct drm_i915_private *i915,
667                                const char *test_name,
668                                unsigned int flags)
669 {
670         struct intel_engine_cs *engine, *other;
671         enum intel_engine_id id, tmp;
672         struct hang h;
673         int err = 0;
674
675         /* Check that issuing a reset on one engine does not interfere
676          * with any other engine.
677          */
678
679         if (!intel_has_reset_engine(i915))
680                 return 0;
681
682         if (flags & TEST_ACTIVE) {
683                 mutex_lock(&i915->drm.struct_mutex);
684                 err = hang_init(&h, i915);
685                 mutex_unlock(&i915->drm.struct_mutex);
686                 if (err)
687                         return err;
688
689                 if (flags & TEST_PRIORITY)
690                         h.ctx->sched.priority = 1024;
691         }
692
693         for_each_engine(engine, i915, id) {
694                 struct active_engine threads[I915_NUM_ENGINES] = {};
695                 unsigned long global = i915_reset_count(&i915->gpu_error);
696                 unsigned long count = 0, reported;
697                 IGT_TIMEOUT(end_time);
698
699                 if (flags & TEST_ACTIVE &&
700                     !intel_engine_can_store_dword(engine))
701                         continue;
702
703                 if (!wait_for_idle(engine)) {
704                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
705                                engine->name, test_name);
706                         err = -EIO;
707                         break;
708                 }
709
710                 memset(threads, 0, sizeof(threads));
711                 for_each_engine(other, i915, tmp) {
712                         struct task_struct *tsk;
713
714                         threads[tmp].resets =
715                                 i915_reset_engine_count(&i915->gpu_error,
716                                                         other);
717
718                         if (!(flags & TEST_OTHERS))
719                                 continue;
720
721                         if (other == engine && !(flags & TEST_SELF))
722                                 continue;
723
724                         threads[tmp].engine = other;
725                         threads[tmp].flags = flags;
726
727                         tsk = kthread_run(active_engine, &threads[tmp],
728                                           "igt/%s", other->name);
729                         if (IS_ERR(tsk)) {
730                                 err = PTR_ERR(tsk);
731                                 goto unwind;
732                         }
733
734                         threads[tmp].task = tsk;
735                         get_task_struct(tsk);
736                 }
737
738                 set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
739                 do {
740                         u32 seqno = intel_engine_get_seqno(engine);
741                         struct i915_request *rq = NULL;
742
743                         if (flags & TEST_ACTIVE) {
744                                 mutex_lock(&i915->drm.struct_mutex);
745                                 rq = hang_create_request(&h, engine);
746                                 if (IS_ERR(rq)) {
747                                         err = PTR_ERR(rq);
748                                         mutex_unlock(&i915->drm.struct_mutex);
749                                         break;
750                                 }
751
752                                 i915_request_get(rq);
753                                 i915_request_add(rq);
754                                 mutex_unlock(&i915->drm.struct_mutex);
755
756                                 if (!wait_until_running(&h, rq)) {
757                                         struct drm_printer p = drm_info_printer(i915->drm.dev);
758
759                                         pr_err("%s: Failed to start request %x, at %x\n",
760                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
761                                         intel_engine_dump(engine, &p,
762                                                           "%s\n", engine->name);
763
764                                         i915_request_put(rq);
765                                         err = -EIO;
766                                         break;
767                                 }
768
769                                 GEM_BUG_ON(!rq->global_seqno);
770                                 seqno = rq->global_seqno - 1;
771                         }
772
773                         err = i915_reset_engine(engine, NULL);
774                         if (err) {
775                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
776                                        engine->name, test_name, err);
777                                 break;
778                         }
779
780                         count++;
781
782                         if (rq) {
783                                 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
784                                 i915_request_put(rq);
785                         }
786
787                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
788                                 struct drm_printer p =
789                                         drm_info_printer(i915->drm.dev);
790
791                                 pr_err("i915_reset_engine(%s:%s):"
792                                        " failed to idle after reset\n",
793                                        engine->name, test_name);
794                                 intel_engine_dump(engine, &p,
795                                                   "%s\n", engine->name);
796
797                                 err = -EIO;
798                                 break;
799                         }
800                 } while (time_before(jiffies, end_time));
801                 clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
802                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
803                         engine->name, test_name, count);
804
805                 reported = i915_reset_engine_count(&i915->gpu_error, engine);
806                 reported -= threads[engine->id].resets;
807                 if (reported != (flags & TEST_ACTIVE ? count : 0)) {
808                                pr_err("i915_reset_engine(%s:%s): reset %lu times, but %lu reported (expected %lu)\n",
809                                engine->name, test_name, count, reported,
810                                (flags & TEST_ACTIVE ? count : 0));
811                         if (!err)
812                                 err = -EINVAL;
813                 }
814
815 unwind:
816                 for_each_engine(other, i915, tmp) {
817                         int ret;
818
819                         if (!threads[tmp].task)
820                                 continue;
821
822                         ret = kthread_stop(threads[tmp].task);
823                         if (ret) {
824                                 pr_err("kthread for other engine %s failed, err=%d\n",
825                                        other->name, ret);
826                                 if (!err)
827                                         err = ret;
828                         }
829                         put_task_struct(threads[tmp].task);
830
831                         if (other != engine &&
832                             threads[tmp].resets !=
833                             i915_reset_engine_count(&i915->gpu_error, other)) {
834                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
835                                        other->name,
836                                        i915_reset_engine_count(&i915->gpu_error,
837                                                                other) -
838                                        threads[tmp].resets);
839                                 if (!err)
840                                         err = -EINVAL;
841                         }
842                 }
843
844                 if (global != i915_reset_count(&i915->gpu_error)) {
845                         pr_err("Global reset (count=%ld)!\n",
846                                i915_reset_count(&i915->gpu_error) - global);
847                         if (!err)
848                                 err = -EINVAL;
849                 }
850
851                 if (err)
852                         break;
853
854                 err = igt_flush_test(i915, 0);
855                 if (err)
856                         break;
857         }
858
859         if (i915_terminally_wedged(&i915->gpu_error))
860                 err = -EIO;
861
862         if (flags & TEST_ACTIVE) {
863                 mutex_lock(&i915->drm.struct_mutex);
864                 hang_fini(&h);
865                 mutex_unlock(&i915->drm.struct_mutex);
866         }
867
868         return err;
869 }
870
871 static int igt_reset_engines(void *arg)
872 {
873         static const struct {
874                 const char *name;
875                 unsigned int flags;
876         } phases[] = {
877                 { "idle", 0 },
878                 { "active", TEST_ACTIVE },
879                 { "others-idle", TEST_OTHERS },
880                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
881                 {
882                         "others-priority",
883                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
884                 },
885                 {
886                         "self-priority",
887                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
888                 },
889                 { }
890         };
891         struct drm_i915_private *i915 = arg;
892         typeof(*phases) *p;
893         int err;
894
895         for (p = phases; p->name; p++) {
896                 if (p->flags & TEST_PRIORITY) {
897                         if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
898                                 continue;
899                 }
900
901                 err = __igt_reset_engines(arg, p->name, p->flags);
902                 if (err)
903                         return err;
904         }
905
906         return 0;
907 }
908
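/*
 * Simulate hangcheck firing: publish the mask of stalled engines, set
 * I915_RESET_HANDOFF and wake the reset waiters so that the reset is
 * carried out.  Returns the reset count sampled beforehand so callers can
 * verify that a reset really occurred.
 */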
909 static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
910 {
911         struct i915_gpu_error *error = &rq->i915->gpu_error;
912         u32 reset_count = i915_reset_count(error);
913
914         error->stalled_mask = mask;
915
916         /* set_bit() must be after we have set up the backchannel (mask) */
917         smp_mb__before_atomic();
918         set_bit(I915_RESET_HANDOFF, &error->flags);
919
920         wake_up_all(&error->wait_queue);
921
922         return reset_count;
923 }
924
925 static int igt_reset_wait(void *arg)
926 {
927         struct drm_i915_private *i915 = arg;
928         struct i915_request *rq;
929         unsigned int reset_count;
930         struct hang h;
931         long timeout;
932         int err;
933
934         if (!intel_engine_can_store_dword(i915->engine[RCS]))
935                 return 0;
936
937         /* Check that we detect a stuck waiter and issue a reset */
938
939         global_reset_lock(i915);
940
941         mutex_lock(&i915->drm.struct_mutex);
942         err = hang_init(&h, i915);
943         if (err)
944                 goto unlock;
945
946         rq = hang_create_request(&h, i915->engine[RCS]);
947         if (IS_ERR(rq)) {
948                 err = PTR_ERR(rq);
949                 goto fini;
950         }
951
952         i915_request_get(rq);
953         i915_request_add(rq);
954
955         if (!wait_until_running(&h, rq)) {
956                 struct drm_printer p = drm_info_printer(i915->drm.dev);
957
958                 pr_err("%s: Failed to start request %x, at %x\n",
959                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
960                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
961
962                 i915_gem_set_wedged(i915);
963
964                 err = -EIO;
965                 goto out_rq;
966         }
967
968         reset_count = fake_hangcheck(rq, ALL_ENGINES);
969
970         timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
971         if (timeout < 0) {
972                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
973                        timeout);
974                 err = timeout;
975                 goto out_rq;
976         }
977
978         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
979         if (i915_reset_count(&i915->gpu_error) == reset_count) {
980                 pr_err("No GPU reset recorded!\n");
981                 err = -EINVAL;
982                 goto out_rq;
983         }
984
985 out_rq:
986         i915_request_put(rq);
987 fini:
988         hang_fini(&h);
989 unlock:
990         mutex_unlock(&i915->drm.struct_mutex);
991         global_reset_unlock(i915);
992
993         if (i915_terminally_wedged(&i915->gpu_error))
994                 return -EIO;
995
996         return err;
997 }
998
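/*
 * Helper kthread for __igt_reset_evict_vma(): attempt to evict the vma's
 * node from its address space while the vma is still busy with the hanging
 * request.  The eviction must wait for that request and so should only make
 * progress once the reset has killed the hang.
 */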
999 struct evict_vma {
1000         struct completion completion;
1001         struct i915_vma *vma;
1002 };
1003
1004 static int evict_vma(void *data)
1005 {
1006         struct evict_vma *arg = data;
1007         struct i915_address_space *vm = arg->vma->vm;
1008         struct drm_i915_private *i915 = vm->i915;
1009         struct drm_mm_node evict = arg->vma->node;
1010         int err;
1011
1012         complete(&arg->completion);
1013
1014         mutex_lock(&i915->drm.struct_mutex);
1015         err = i915_gem_evict_for_node(vm, &evict, 0);
1016         mutex_unlock(&i915->drm.struct_mutex);
1017
1018         return err;
1019 }
1020
1021 static int __igt_reset_evict_vma(struct drm_i915_private *i915,
1022                                  struct i915_address_space *vm)
1023 {
1024         struct drm_i915_gem_object *obj;
1025         struct task_struct *tsk = NULL;
1026         struct i915_request *rq;
1027         struct evict_vma arg;
1028         struct hang h;
1029         int err;
1030
1031         if (!intel_engine_can_store_dword(i915->engine[RCS]))
1032                 return 0;
1033
1034         /* Check that we can recover an unbind stuck on a hanging request */
1035
1036         global_reset_lock(i915);
1037
1038         mutex_lock(&i915->drm.struct_mutex);
1039         err = hang_init(&h, i915);
1040         if (err)
1041                 goto unlock;
1042
1043         obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
1044         if (IS_ERR(obj)) {
1045                 err = PTR_ERR(obj);
1046                 goto fini;
1047         }
1048
1049         arg.vma = i915_vma_instance(obj, vm, NULL);
1050         if (IS_ERR(arg.vma)) {
1051                 err = PTR_ERR(arg.vma);
1052                 goto out_obj;
1053         }
1054
1055         rq = hang_create_request(&h, i915->engine[RCS]);
1056         if (IS_ERR(rq)) {
1057                 err = PTR_ERR(rq);
1058                 goto out_obj;
1059         }
1060
1061         err = i915_vma_pin(arg.vma, 0, 0,
1062                            i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
1063         if (err)
1064                 goto out_obj;
1065
1066         err = i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
1067         i915_vma_unpin(arg.vma);
1068
1069         i915_request_get(rq);
1070         i915_request_add(rq);
1071         if (err)
1072                 goto out_rq;
1073
1074         mutex_unlock(&i915->drm.struct_mutex);
1075
1076         if (!wait_until_running(&h, rq)) {
1077                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1078
1079                 pr_err("%s: Failed to start request %x, at %x\n",
1080                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1081                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1082
1083                 i915_gem_set_wedged(i915);
1084                 goto out_reset;
1085         }
1086
1087         init_completion(&arg.completion);
1088
1089         tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
1090         if (IS_ERR(tsk)) {
1091                 err = PTR_ERR(tsk);
1092                 tsk = NULL;
1093                 goto out_reset;
1094         }
1095
1096         wait_for_completion(&arg.completion);
1097
1098         if (wait_for(waitqueue_active(&rq->execute), 10)) {
1099                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1100
1101                 pr_err("igt/evict_vma kthread did not wait\n");
1102                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1103
1104                 i915_gem_set_wedged(i915);
1105                 goto out_reset;
1106         }
1107
1108 out_reset:
1109         fake_hangcheck(rq, intel_engine_flag(rq->engine));
1110
1111         if (tsk) {
1112                 struct igt_wedge_me w;
1113
1114                 /* The reset, even indirectly, should take less than 10ms. */
1115                 igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
1116                         err = kthread_stop(tsk);
1117         }
1118
1119         mutex_lock(&i915->drm.struct_mutex);
1120 out_rq:
1121         i915_request_put(rq);
1122 out_obj:
1123         i915_gem_object_put(obj);
1124 fini:
1125         hang_fini(&h);
1126 unlock:
1127         mutex_unlock(&i915->drm.struct_mutex);
1128         global_reset_unlock(i915);
1129
1130         if (i915_terminally_wedged(&i915->gpu_error))
1131                 return -EIO;
1132
1133         return err;
1134 }
1135
1136 static int igt_reset_evict_ggtt(void *arg)
1137 {
1138         struct drm_i915_private *i915 = arg;
1139
1140         return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
1141 }
1142
1143 static int igt_reset_evict_ppgtt(void *arg)
1144 {
1145         struct drm_i915_private *i915 = arg;
1146         struct i915_gem_context *ctx;
1147         int err;
1148
1149         mutex_lock(&i915->drm.struct_mutex);
1150         ctx = kernel_context(i915);
1151         mutex_unlock(&i915->drm.struct_mutex);
1152         if (IS_ERR(ctx))
1153                 return PTR_ERR(ctx);
1154
1155         err = 0;
1156         if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
1157                 err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);
1158
1159         kernel_context_close(ctx);
1160         return err;
1161 }
1162
1163 static int wait_for_others(struct drm_i915_private *i915,
1164                            struct intel_engine_cs *exclude)
1165 {
1166         struct intel_engine_cs *engine;
1167         enum intel_engine_id id;
1168
1169         for_each_engine(engine, i915, id) {
1170                 if (engine == exclude)
1171                         continue;
1172
1173                 if (!wait_for_idle(engine))
1174                         return -EIO;
1175         }
1176
1177         return 0;
1178 }
1179
1180 static int igt_reset_queue(void *arg)
1181 {
1182         struct drm_i915_private *i915 = arg;
1183         struct intel_engine_cs *engine;
1184         enum intel_engine_id id;
1185         struct hang h;
1186         int err;
1187
1188         /* Check that we replay pending requests following a hang */
1189
1190         global_reset_lock(i915);
1191
1192         mutex_lock(&i915->drm.struct_mutex);
1193         err = hang_init(&h, i915);
1194         if (err)
1195                 goto unlock;
1196
1197         for_each_engine(engine, i915, id) {
1198                 struct i915_request *prev;
1199                 IGT_TIMEOUT(end_time);
1200                 unsigned int count;
1201
1202                 if (!intel_engine_can_store_dword(engine))
1203                         continue;
1204
1205                 prev = hang_create_request(&h, engine);
1206                 if (IS_ERR(prev)) {
1207                         err = PTR_ERR(prev);
1208                         goto fini;
1209                 }
1210
1211                 i915_request_get(prev);
1212                 i915_request_add(prev);
1213
1214                 count = 0;
1215                 do {
1216                         struct i915_request *rq;
1217                         unsigned int reset_count;
1218
1219                         rq = hang_create_request(&h, engine);
1220                         if (IS_ERR(rq)) {
1221                                 err = PTR_ERR(rq);
1222                                 goto fini;
1223                         }
1224
1225                         i915_request_get(rq);
1226                         i915_request_add(rq);
1227
1228                         /*
1229                          * XXX We don't handle resetting the kernel context
1230                          * very well. If we trigger a device reset twice in
1231                          * quick succession while the kernel context is
1232                          * executing, we may end up skipping the breadcrumb.
1233                          * This is really only a problem for the selftest as
1234                          * normally there is a large interlude between resets
1235                          * (hangcheck), or we focus on resetting just one
1236                          * engine and so avoid repeatedly resetting innocents.
1237                          */
1238                         err = wait_for_others(i915, engine);
1239                         if (err) {
1240                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1241                                        __func__, engine->name);
1242                                 i915_request_put(rq);
1243                                 i915_request_put(prev);
1244
1245                                 GEM_TRACE_DUMP();
1246                                 i915_gem_set_wedged(i915);
1247                                 goto fini;
1248                         }
1249
1250                         if (!wait_until_running(&h, prev)) {
1251                                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1252
1253                                 pr_err("%s(%s): Failed to start request %x, at %x\n",
1254                                        __func__, engine->name,
1255                                        prev->fence.seqno, hws_seqno(&h, prev));
1256                                 intel_engine_dump(engine, &p,
1257                                                   "%s\n", engine->name);
1258
1259                                 i915_request_put(rq);
1260                                 i915_request_put(prev);
1261
1262                                 i915_gem_set_wedged(i915);
1263
1264                                 err = -EIO;
1265                                 goto fini;
1266                         }
1267
1268                         reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
1269
1270                         i915_reset(i915, ENGINE_MASK(id), NULL);
1271
1272                         GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
1273                                             &i915->gpu_error.flags));
1274
1275                         if (prev->fence.error != -EIO) {
1276                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1277                                        prev->fence.error);
1278                                 i915_request_put(rq);
1279                                 i915_request_put(prev);
1280                                 err = -EINVAL;
1281                                 goto fini;
1282                         }
1283
1284                         if (rq->fence.error) {
1285                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1286                                        rq->fence.error);
1287                                 i915_request_put(rq);
1288                                 i915_request_put(prev);
1289                                 err = -EINVAL;
1290                                 goto fini;
1291                         }
1292
1293                         if (i915_reset_count(&i915->gpu_error) == reset_count) {
1294                                 pr_err("No GPU reset recorded!\n");
1295                                 i915_request_put(rq);
1296                                 i915_request_put(prev);
1297                                 err = -EINVAL;
1298                                 goto fini;
1299                         }
1300
1301                         i915_request_put(prev);
1302                         prev = rq;
1303                         count++;
1304                 } while (time_before(jiffies, end_time));
1305                 pr_info("%s: Completed %d resets\n", engine->name, count);
1306
1307                 *h.batch = MI_BATCH_BUFFER_END;
1308                 i915_gem_chipset_flush(i915);
1309
1310                 i915_request_put(prev);
1311
1312                 err = igt_flush_test(i915, I915_WAIT_LOCKED);
1313                 if (err)
1314                         break;
1315         }
1316
1317 fini:
1318         hang_fini(&h);
1319 unlock:
1320         mutex_unlock(&i915->drm.struct_mutex);
1321         global_reset_unlock(i915);
1322
1323         if (i915_terminally_wedged(&i915->gpu_error))
1324                 return -EIO;
1325
1326         return err;
1327 }
1328
1329 static int igt_handle_error(void *arg)
1330 {
1331         struct drm_i915_private *i915 = arg;
1332         struct intel_engine_cs *engine = i915->engine[RCS];
1333         struct hang h;
1334         struct i915_request *rq;
1335         struct i915_gpu_state *error;
1336         int err;
1337
1338         /* Check that we can issue a global GPU and engine reset */
1339
1340         if (!intel_has_reset_engine(i915))
1341                 return 0;
1342
1343         if (!engine || !intel_engine_can_store_dword(engine))
1344                 return 0;
1345
1346         mutex_lock(&i915->drm.struct_mutex);
1347
1348         err = hang_init(&h, i915);
1349         if (err)
1350                 goto err_unlock;
1351
1352         rq = hang_create_request(&h, engine);
1353         if (IS_ERR(rq)) {
1354                 err = PTR_ERR(rq);
1355                 goto err_fini;
1356         }
1357
1358         i915_request_get(rq);
1359         i915_request_add(rq);
1360
1361         if (!wait_until_running(&h, rq)) {
1362                 struct drm_printer p = drm_info_printer(i915->drm.dev);
1363
1364                 pr_err("%s: Failed to start request %x, at %x\n",
1365                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1366                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1367
1368                 i915_gem_set_wedged(i915);
1369
1370                 err = -EIO;
1371                 goto err_request;
1372         }
1373
1374         mutex_unlock(&i915->drm.struct_mutex);
1375
1376         /* Temporarily disable error capture */
1377         error = xchg(&i915->gpu_error.first_error, (void *)-1);
1378
1379         i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);
1380
1381         xchg(&i915->gpu_error.first_error, error);
1382
1383         mutex_lock(&i915->drm.struct_mutex);
1384
1385         if (rq->fence.error != -EIO) {
1386                 pr_err("Guilty request not identified!\n");
1387                 err = -EINVAL;
1388                 goto err_request;
1389         }
1390
1391 err_request:
1392         i915_request_put(rq);
1393 err_fini:
1394         hang_fini(&h);
1395 err_unlock:
1396         mutex_unlock(&i915->drm.struct_mutex);
1397         return err;
1398 }
1399
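/*
 * Live selftest entry point.  Requires working GPU reset; periodic
 * hangcheck is disabled for the duration (and restored afterwards) so that
 * only the tests themselves trigger resets, and a runtime PM reference is
 * held across the whole run.
 */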
1400 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1401 {
1402         static const struct i915_subtest tests[] = {
1403                 SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1404                 SUBTEST(igt_hang_sanitycheck),
1405                 SUBTEST(igt_reset_idle_engine),
1406                 SUBTEST(igt_reset_active_engine),
1407                 SUBTEST(igt_reset_engines),
1408                 SUBTEST(igt_reset_queue),
1409                 SUBTEST(igt_reset_wait),
1410                 SUBTEST(igt_reset_evict_ggtt),
1411                 SUBTEST(igt_reset_evict_ppgtt),
1412                 SUBTEST(igt_handle_error),
1413         };
1414         bool saved_hangcheck;
1415         int err;
1416
1417         if (!intel_has_gpu_reset(i915))
1418                 return 0;
1419
1420         if (i915_terminally_wedged(&i915->gpu_error))
1421                 return -EIO; /* we're long past hope of a successful reset */
1422
1423         intel_runtime_pm_get(i915);
1424         saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1425
1426         err = i915_subtests(tests, i915);
1427
1428         mutex_lock(&i915->drm.struct_mutex);
1429         igt_flush_test(i915, I915_WAIT_LOCKED);
1430         mutex_unlock(&i915->drm.struct_mutex);
1431
1432         i915_modparams.enable_hangcheck = saved_hangcheck;
1433         intel_runtime_pm_put(i915);
1434
1435         return err;
1436 }