sv_magic -> scratchpad
[chai.git] / re.c
#include <inttypes.h>
#include <memory.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#include "shim.h"
#include "jobs.h"
#include "memory.h"
#include "../oolong/chai-notes.h"
12
/* Address written to mfbd->heap_free_address and to the replay payload's
 * tiler_heap_free field. */
#define HEAP_FREE_ADDRESS 0x1DABE0000

/* Byte offset into the scratchpad used for the set-value job's output and
 * for the MFBD block2 entries. */
#define SV_OFFSET (0x4000)

/* TODO: allocate these indices dynamically; message passing */

/* job_index values for the jobs in the set-value -> vertex -> tiler chain. */
#define INDEX_VERTEX 1
#define INDEX_TILER 2
#define INDEX_SV 3

/* job_index of the fragment job (submitted as its own atom). */
#define INDEX_FRAGMENT 1

/* Number of atoms submitted so far; pre-incremented to produce each new
 * atom_number and read as-is to name the previous atom as a dependency. */
int atom_count = 0;

/* Address attached to fragment/replay atoms as an external resource. */
uint64_t framebuffer;

/* Addresses of the most recent fragment and tiler job descriptors, recorded
 * by the job_chain_* functions and consumed by job_chain_replay(). */
uint64_t last_fragment;
uint64_t last_tiler;
31
32 void *set_value_helper(int fd, uint64_t out) {
33         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_set_value));
34
35         struct job_descriptor_header header = {
36                 .exception_status = JOB_NOT_STARTED,
37                 .first_incomplete_task = 0,
38                 .fault_pointer = 0,
39                 .job_descriptor_size = 1, /* 64-bit */
40                 .job_type = JOB_TYPE_SET_VALUE,
41                 .job_barrier = 0, 
42                 .job_index = INDEX_SV,
43                 .job_dependency_index_1 = 0,
44                 .job_dependency_index_2 = 0,
45                 .next_job = 0 
46         };
47
48         struct payload_set_value payload = {
49                 .out = out,
50                 .unknown = 0x03
51         };
52
53         memcpy(packet, &header, sizeof(header));
54         memcpy(packet + sizeof(header), &payload, sizeof(payload));
55
56         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
57         return packet;
58 }
59
60 uint64_t make_mfbd(bool tiler, uint64_t scratchpad)
61 {
62         struct tentative_mfbd *mfbd = galloc(sizeof(struct tentative_mfbd));
63         memset(mfbd, 0, sizeof(struct tentative_mfbd));
64
65         /* zeroes */
66         mfbd->block2[0] = scratchpad + SV_OFFSET;
67         mfbd->block2[1] = scratchpad + SV_OFFSET + 0x200;
68         mfbd->ugaT = /* (uint32_t) galloc(64) */ /* 0x5ABA0000 */ scratchpad;
69         mfbd->unknown_gpu_address = /* (uint32_t) galloc(64) */ /* 0x5ABA0100 */ 0;
70
71         /* Unknown contents -- it's a mystery! */
72         mfbd->unknown2 = /* (uint32_t) galloc(64) */ 0x5ABA0200;
73         mfbd->unknown_gpu_addressN = /* (uint32_t) galloc(64) */ /* 0x5ABA0300 */ 0;
74
75         /* Match traces. TODO decode */
76         mfbd->flags = 0xF0;
77         mfbd->heap_free_address = HEAP_FREE_ADDRESS;
78         mfbd->blah = 0x1F00000000;
79         mfbd->unknown3 = tiler ? 0 : 0xFFFFF8C0;
80         mfbd->unknown1 = 0x1600;
81
82         mfbd->block1[4] = 0x02D801C2;
83         mfbd->block1[6] = 0x02D801C2;
84
85         /* This might not a tiler issue so much as a which-frame issue.
86          * First tiler is 0xFF form. Rest of C021. All fragment C021.
87          * TODO: Investigate!
88          */
89
90         mfbd->block1[7] = tiler ? 0x04001080 : 0x01001080;
91         mfbd->block1[8] = tiler ? 0x000000FF : 0xC0210000;
92         mfbd->block1[9] = /* tiler */ false ? 0x3F800000 : 0x00000000;
93
94         /*mfbd->block3[0] = 0x00000158;
95         mfbd->block3[1] = 0x00000420;
96         mfbd->block3[14] = 0x04000000;
97         mfbd->block3[15] = 0x880A8899;*/
98
99         uint64_t sab0 = 0x5ABA5ABA00000000;
100
101         uint64_t block3[] = {
102                 0x0000000000000000,
103                 0x0000000000030005,
104                 sab0 /*0x00000000B27F1600*/,
105                 mfbd->block2[0] /*0x0000000102024000*/,
106                 0x0000000000000003,
107                 0x0000000000000000,
108                 0x0000000000000000,
109                 0x0000000000000000,
110                 sab0 + 0x300 /* 0x00000000B27F1900 */
111         };
112
113         memcpy(mfbd->block3, block3, sizeof(block3));
114
115         return (uint64_t ) (uint32_t) mfbd | MFBD | (tiler ? FBD_VERTEX_TILER : FBD_FRAGMENT);
116 }
117
118 int job_chain_fragment(int fd, uint64_t scratchpad) {
119         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_fragment));
120
121         struct job_descriptor_header header = {
122                 .exception_status = JOB_NOT_STARTED,
123                 .first_incomplete_task = 0,
124                 .fault_pointer = 0,
125                 .job_descriptor_size = JOB_32_BIT,
126                 .job_type = JOB_TYPE_FRAGMENT,
127                 .job_barrier = 0, 
128                 .job_index = INDEX_FRAGMENT,
129                 .job_dependency_index_1 = 0,
130                 .job_dependency_index_2 = 0,
131                 .next_job = 0 
132         };
133
134         struct payload_fragment payload = {
135                 .min_tile_coord = MAKE_TILE_COORD(0, 0, 0),
136                 .max_tile_coord = MAKE_TILE_COORD(29, 45, 0),
137                 .fragment_fbd = make_mfbd(false, scratchpad)
138         };
139
140         memcpy(packet, &header, sizeof(header));
141         memcpy(packet + sizeof(header), &payload, sizeof(payload));
142         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
143
144         struct base_dependency depNoDep = {
145                 .atom_id = 0,
146                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
147         };
148
149         struct base_dependency depTiler = {
150                 .atom_id = atom_count /* last one */,
151                 .dependency_type = BASE_JD_DEP_TYPE_DATA
152         };
153
154         printf("Framebuffer: %LX\n", framebuffer);
155         uint64_t* resource = calloc(sizeof(u64), 1);
156         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
157
158         /* TODO: free resource */
159
160         struct base_jd_atom_v2 job = {
161                 .jc = (uint32_t) packet,
162                 .extres_list = resource,
163                 .nr_extres = 1,
164                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_FS,
165                 .atom_number = ++atom_count,
166                 .prio = BASE_JD_PRIO_MEDIUM,
167                 .device_nr = 0
168         };
169
170         job.pre_dep[0] = depTiler;
171         job.pre_dep[1] = depNoDep;
172
173         submit_job(fd, job);
174
175         last_fragment = (uint32_t) packet;
176
177         return 0;
178 }
179
180 uint64_t import_shader(int fd, uint8_t *shader, size_t sz, bool fragment)
181 {
182         int pages = 1 + (sz >> PAGE_SHIFT);
183
184         uint64_t gpu = alloc_gpu_pages(fd, pages, BASE_MEM_PROT_CPU_RD |
185                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD |
186                         BASE_MEM_PROT_GPU_EX); uint8_t *cpu =
187                 mmap_gpu(fd, gpu, pages);
188
189         memcpy(cpu, shader, sz);
190
191         /* Trap once the fragment shader is executed */
192         if(fragment) {
193                 for(int i = 0; i < 128; ++i) {
194                         cpu[i] = rand() & 0xFF;
195                 }
196         }
197
198         sync_gpu(fd, cpu, gpu, sz);
199
200         /* TODO: munmap */
201
202         return gpu | (fragment ? 9 : 5); /* Unknown flag */
203 }
204
205 uint32_t upload_vertices()
206 {
207         /* TODO: Vertices should be parametric */
208         float vertices[] = {
209                 0.0, 0.0, 0.0,
210                 0.5, 1.0, 0.0,
211                 1.0, 0.0, 0.0
212         };
213
214         struct vertex_buffer *vb;
215         vb = (struct vertex_buffer*) galloc(sizeof(*vb));
216         
217         vb->vertices = (uint64_t) (uint32_t) galloc(sizeof(vertices));
218         memcpy((void*) (uint32_t) vb->vertices, vertices, sizeof(vertices));
219
220         vb->vertex_size = sizeof(float) * 3;
221         vb->size = sizeof(vertices);
222
223         vb->vertices |= 1; /* TODO flags */
224         
225         return (uint32_t) vb;
226 }
227
228 void* vertex_tiler_helper(int fd, bool tiler, uint32_t fbd, uint32_t vertex_buffer, uint32_t zero_buffer)
229 {
230         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_vertex_tiler32));
231
232         struct job_descriptor_header header = {
233                 .exception_status = JOB_NOT_STARTED,
234                 .first_incomplete_task = 0,
235                 .fault_pointer = 0,
236                 .job_descriptor_size = JOB_32_BIT,
237                 .job_barrier = 0, 
238                 .job_type = tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
239                 .job_index = tiler ? INDEX_TILER : INDEX_VERTEX,
240                 .job_dependency_index_1 = tiler ? INDEX_VERTEX : 0,
241                 .job_dependency_index_2 = tiler ? 0 : INDEX_SV
242         };
243
244         /* TODO */
245         uint32_t mode = CHAI_TRIANGLE_FAN;
246         uint32_t mode_gooks = 0x14000000 | (tiler ? (0x030000 | mode) : 0);
247         uint32_t other_gook = tiler ? 0x00000003 : 0x00000000;
248
249         struct payload_vertex_tiler32 payload = {
250                 .block1 = {
251                         0x00000003, 0x28000000, mode_gooks, 0x00000000,
252                         0x00000000, other_gook, 0x00000000, 0x00000000,
253                         0x00000005, 0x00000000, 0x00000000
254                 },
255                 .zeroes = zero_buffer,
256                 .unknown1 = (uint32_t) galloc(16),
257                 .null1 = 0,
258                 .null2 = 0,
259                 .unknown2 = (uint32_t) galloc(32),
260                 .shader = (uint32_t) galloc(sizeof(struct shader_meta)),
261                 .vertices = vertex_buffer,
262                 .unknown4 = (uint32_t) galloc(16),
263                 .unknown5 = (uint32_t) galloc(32),
264                 .unknown6 = (uint32_t) galloc(64),
265                 .nullForVertex = tiler ? (uint32_t) galloc(64) : 0,
266                 .null4 = 0,
267                 .fbd = fbd,
268                 .unknown7 = tiler ? 0 : ((uint32_t) galloc(64) | 1) /* TODO */
269         };
270
271         struct shader_meta *shader = (struct shader_meta*) payload.shader;
272
273         /* TODO: Integrate an assembler */
274 #include "../shader_hex.h"
275         shader->shader = import_shader(fd,
276                         (uint8_t*) (tiler ? fragment_shader : vertex_shader),
277                         tiler ? sizeof(fragment_shader) : sizeof(vertex_shader),
278                         tiler);
279
280         if(!tiler) {
281                 uint32_t ni[] = {
282                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000,
283                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000
284                 };
285
286                 memcpy((void*) payload.unknown2, ni, sizeof(ni));
287         }
288
289         if(tiler) {
290                 /* Lose precision... on purpose? */
291                 payload.unknown7 = (uint32_t) shader->shader;
292         }
293
294         payload.unknown7 = tiler ? 0xDEADBA00 : 0xDEADFA00;
295
296         /* TODO: Decode me! */
297
298         if(tiler) {
299                 shader->unknown1 = 0x0007000000000000;
300                 shader->unknown2 = 0x0000000000020602;
301         } else {
302                 shader->unknown1 = 0x0005000100000000;
303                 shader->unknown2 = 0x0000000000420002;
304         }
305
306         /* I have *no* idea */
307
308         uint32_t *p = (uint32_t*) payload.unknown4;
309         *p = 0x2DEA2200;
310
311         uint64_t pi[] = {
312                 0x0000000017E49000, 0x0000000017E49000, 
313                 0x0000000017E49000, 0x0000000017E49000, 
314                 0x00000000179A2200, 0x0000000017E49000, 
315                 0x0000000017E49000
316         };
317
318         memcpy((void*) payload.unknown6, pi, sizeof(pi));
319
320         if(tiler) {
321                 uint32_t ni[] = {
322                         0xFF800000, 0xFF800000,
323                         0x7F800000, 0x7F800000,
324                         0x00000000, 0x3F800000,
325                         0x00000000, 0x00EF013F,
326                         0x00000000, 0x0000001F,
327                         0x02020000, 0x00000001
328                 };
329
330                 memcpy((void*) payload.nullForVertex, ni, sizeof(ni));
331         }
332
333         /* Use some magic numbers from the traces */
334         uint64_t* unk1 = (uint64_t*) payload.unknown1;
335         unk1[0] = 0x000000B296271001;
336         unk1[1] = 0x000000B296273000;
337
338         uint32_t writeBuffer = (uint32_t) galloc(64);
339
340         uint64_t* unk5 = (uint64_t*) payload.unknown5;
341         unk5[0] = ((uint64_t) (tiler ? 0xDB : 0xA3) << 56) | writeBuffer | 1;
342         unk5[1] = 0x0000004000000010;
343
344         if(tiler) {
345                 uint32_t ni[] = {
346                         0x00000001, 0x00000000, 0x00070000, 0x00020602,
347                         0x00000000, 0x00000000, 0x00000000, 0x3712FFFF,
348                         0x44F0FFFF, 0x0007FF00, 0x0007FF00, 0x00000000,
349                         0x00000000, 0x00000000, 0x00000000, 0x00000200,
350                         0x00000000, 0xF0122122, 0x00000000, 0x00000000,
351                         0x00000000, 0xF0122122, 0x00000000, 0xFF800000,
352                         0xFF800000, 0x7F800000, 0x7F800000, 0x00000000,
353                         0x3F800000, 0x00000000, 0xEF013F00, 0x00000000,
354                         0x0000001F, 0x02020000, 0x00000001, 0x00000000
355                 };
356
357                 memcpy(payload.block2, ni, sizeof(ni));
358         } else {
359                 uint32_t ni[] = {
360                         0x00000000, 0x0000000C, 0x00000030, 0x2DEA2200,
361                         0x00000000, 0x00000000, 0x00000000, /* Address to 1 */ 0xCAFEDA01,
362                         0x57000000, 0x00000010, 0x00000040, 0x17E49000,
363                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
364                         0x00000000, 0x17E49000, 0x00000000, 0x179A2200,
365                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
366                         0x00000000, 0x00000000, 0x00000000, 0x43200000,
367                         0x42F00000, 0x3F000000, 0x00000000, 0x43200000,
368                         0x42F00000, 0x3F000000, 0x00000000, 0x00000000
369                 };
370
371                 memcpy(payload.block2, ni, sizeof(ni));
372         }
373
374         /* Trap tiler job execution */
375
376         if(tiler) {
377                 payload.shader = 0x5AB00A05;
378
379                 /* Hit second */
380                 //payload.zeroes = 0x5AB01A00;
381
382                 payload.unknown1 = 0x5AB02A00;
383                 payload.unknown2 = 0x5AB03A00;
384                 payload.vertices = 0x5AB04A00;
385                 payload.unknown4 = 0x5AB05A00;
386                 payload.unknown5 = 0x5AB06A00;
387                 payload.unknown6 = 0x5AB07A00;
388
389                 /* Hit third */
390                 // payload.fbd   = 0x5AB09A00;
391
392                 /* Hit first */
393                 // payload.nullForVertex = 0x5AB08A00;
394         }
395
396         memcpy(packet, &header, sizeof(header));
397         memcpy(packet + sizeof(header), &payload, sizeof(payload));
398         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
399
400         return packet;
401 }
402
403 int job_chain_vertex_tiler(int fd, uint64_t scratchpad) {
404         uint32_t vertex_buffer = upload_vertices();
405         uint32_t vertex_fbd = (uint32_t) make_mfbd(true, scratchpad);
406
407         uint32_t zero_buffer = (uint32_t) alloc_gpu_pages(fd, 0x20,
408                         0x3800 | BASE_MEM_PROT_CPU_RD |
409                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD);
410         memset( (void*) zero_buffer, 0xDA, 0x20 << PAGE_SHIFT);
411         sync_gpu(fd, (void*) zero_buffer, zero_buffer, 0x20 << PAGE_SHIFT);
412
413         void *set = set_value_helper(fd, scratchpad + SV_OFFSET);
414         void *vertex = vertex_tiler_helper(fd, false, vertex_fbd, vertex_buffer, zero_buffer);
415         void *tiler = vertex_tiler_helper(fd, true, vertex_fbd, vertex_buffer, zero_buffer);
416
417         ((struct job_descriptor_header*) set)->next_job = (uint32_t) vertex;
418         ((struct job_descriptor_header*) vertex)->next_job = (uint32_t) tiler;
419
420         struct base_dependency depNoDep = {
421                 .atom_id = 0,
422                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
423         };
424
425         struct base_jd_atom_v2 job = {
426                 .jc = (uint32_t) set,
427                 .extres_list = NULL,
428                 .nr_extres = 0,
429                 .core_req = BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | BASE_JD_REQ_COHERENT_GROUP,
430                 .atom_number = ++atom_count,
431                 .prio = BASE_JD_PRIO_MEDIUM,
432                 .device_nr = 0
433         };
434
435         job.pre_dep[0] = depNoDep;
436         job.pre_dep[1] = depNoDep;
437
438         submit_job(fd, job);
439
440         last_tiler = (uint32_t) tiler;
441
442         return 0;
443 }
444
445 void job_chain_replay(int fd)
446 {
447         struct base_jd_replay_payload *payload;
448
449         payload = (struct base_jd_replay_payload*) galloc(sizeof(*payload));
450
451         payload->tiler_jc_list = last_tiler;
452         payload->fragment_jc = last_fragment;
453         payload->tiler_heap_free = HEAP_FREE_ADDRESS;
454         payload->fragment_hierarchy_mask = 0;
455         payload->tiler_hierarchy_mask = 0;
456         payload->hierarchy_default_weight = 0x10000;
457         payload->tiler_core_req = BASE_JD_REQ_T | BASE_JD_REQ_COHERENT_GROUP;
458         payload->fragment_core_req = BASE_JD_REQ_FS;
459
460         struct base_dependency depNoDep = {
461                 .atom_id = 0,
462                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
463         };
464
465         struct base_dependency depFragment = {
466                 .atom_id = atom_count,
467                 .dependency_type = BASE_JD_DEP_TYPE_DATA
468         };
469
470         uint64_t* resource = malloc(sizeof(u64) * 1);
471         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
472
473         struct base_jd_atom_v2 job = {
474                 .jc = (uint32_t) payload,
475                 .extres_list = resource,
476                 .nr_extres = 1,
477                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_SOFT_REPLAY,
478                 .atom_number = ++atom_count,
479                 .prio = BASE_JD_PRIO_LOW,
480                 .device_nr = 0
481         };
482
483         job.pre_dep[0] = depFragment;
484         job.pre_dep[1] = depNoDep;
485
486         submit_job(fd, job);
487 }
488
489 extern uint32_t cbma_bottom;
490 extern uint32_t cbma_top;
491
492 int main()
493 {
494         int fd = open_kernel_module();
495
496         init_cbma(fd);
497         stream_create(fd, "malitl_339_0x53ae8");
498         stream_create(fd, "malitl_339_0x53f78");
499
500         //size_t fb_size = 29 * 16 * 45 * 16 * 4 * 2;
501
502         // framebuffer = (uint64_t) (uint32_t) galloc(fb_size);
503
504         /* Fake framebuffer to trap accesses */
505         framebuffer = 0x1CAFE0000;
506         printf("Framebuffer: %LX\n", framebuffer);
507
508         uint64_t scratchpad = alloc_gpu_pages(fd, 8, 0xC);
509         job_chain_vertex_tiler(fd, scratchpad);
510         job_chain_fragment(fd, scratchpad);
511         job_chain_replay(fd);
512         sync_gpu(fd, (uint8_t*) cbma_top, cbma_top, cbma_bottom - cbma_top);
513         flush_job_queue(fd);
514
515         sleep(3);
516
517         /* Dump framebuffer to a file */
518         /*uint8_t *fb = (uint8_t*) (uint32_t) framebuffer;
519         FILE *fp = fopen("framebuffer.bin", "wb");
520         fwrite(fb, 1, fb_size, fp);
521         fclose(fp);*/
522
523         /* Hang to prevent the tracer from going bananas */
524
525         while(1);
526
527         return 0;
528 }