/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h" /* mock_file() / mock_file_free() used by active_engine() */

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

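/*
 * The hang fixture below drives every test in this file: "obj" holds a
 * batch that spins forever (it jumps back to its own start), and "hws" is
 * a scratch page the batch writes its seqno into, so the CPU can poll to
 * see that the spinner has actually started executing on the GPU.
 */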
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

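/*
 * Build the self-referencing batch for one request. The layout is the
 * same on every gen: store the request's seqno into the hws page (so
 * wait_until_running() can observe it), emit MI_ARB_CHECK arbitration
 * points, pad, and then MI_BATCH_BUFFER_START back to the top of the
 * batch, spinning forever. Only the command encodings differ per gen;
 * the closing MI_BATCH_BUFFER_END is never reached unless the CPU
 * rewrites the batch to terminate it.
 */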
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->gem_context->ppgtt ?
		&rq->gem_context->ppgtt->vm :
		&i915->ggtt.vm;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	err = i915_vma_move_to_active(hws, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

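/*
 * Allocate a request that will hang. If the previous batch object is
 * still busy (a hung spinner we have since abandoned), swap in a fresh
 * internal object so that each request owns a batch we can later
 * terminate independently.
 */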
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		i915_request_add(rq);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

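/*
 * Poll the hws page until the spinner reports its seqno: first a short
 * busy-wait for the common case, then a sleeping wait before declaring
 * that the request failed to start.
 */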
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

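/*
 * Serialise against the driver's own reset paths by claiming both the
 * global I915_RESET_BACKOFF bit and every per-engine reset bit, waiting
 * for any reset already in flight to complete first.
 */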
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

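/*
 * I915_RESET_HANDOFF is set here to mimic the hangcheck worker handing
 * the reset over to us; i915_reset() is expected to consume it and bump
 * the global reset counter.
 */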
static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

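/*
 * Common body for the idle/active engine-reset tests. For the active
 * flavour we keep a spinner running on the engine and check that each
 * engine reset increments only the per-engine reset count, never the
 * global one, and that the engine can idle again afterwards.
 */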
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			if (!wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

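/*
 * Background load for the others-idle/others-active/self phases: each
 * active_engine kthread keeps a ring of 8 in-flight requests (one per
 * context) on its engine, optionally at randomised priorities, while
 * the main thread resets the target engine.
 */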
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno,
			  i915_request_global_seqno(rq));
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

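/*
 * Reset one engine while the other engines are kept busy (or idle,
 * depending on flags), then verify via the per-engine reset counts that
 * no innocent engine was reset and that no full GPU reset occurred.
 */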
static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

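/*
 * Pretend to be the hangcheck worker: mark the stalled engines and set
 * I915_RESET_HANDOFF so that a stuck waiter (or the test itself) picks
 * up and performs the reset, returning the pre-reset count for later
 * comparison.
 */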
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

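/*
 * The evict tests check that an unbind/evict blocked behind a hanging
 * request is unstuck by the reset: evict_vma() runs in a kthread and
 * blocks in i915_gem_evict_for_node() until the hang is resolved.
 */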
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
	if (err)
		goto out_obj;

	err = i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}

	wait_for_completion(&arg.completion);

	if (wait_for(waitqueue_active(&rq->execute), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	fake_hangcheck(rq, intel_engine_flag(rq->engine));

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			err = kthread_stop(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	int err;

	mutex_lock(&i915->drm.struct_mutex);
	ctx = kernel_context(i915);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);

	kernel_context_close(ctx);
	return err;
}

static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

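/*
 * Queue a chain of hanging requests, reset the guilty one and check the
 * driver replays the queued (innocent) request: the hung request must
 * carry fence.error == -EIO while its successor remains untainted.
 */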
static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

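/*
 * Exercise the top-level i915_handle_error() entry point (the path real
 * hang detection takes) and check that it flags the guilty request.
 */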
static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

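/*
 * Note: igt_global_reset is deliberately listed first so a wedged GPU
 * gets one recovery attempt before the remaining tests run, and
 * hangcheck itself is disabled for the duration so the induced hangs
 * are only reset when each test asks for it. (These live selftests are
 * normally invoked by loading i915 with the i915.live_selftests module
 * parameter set; see i915_selftest.c.)
 */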
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO; /* we're long past hope of a successful reset */

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}