Real GPU heap
[chai.git] / re.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <memory.h>
4 #include <sys/mman.h>
5 #include <stdbool.h>
6 #include <unistd.h>
7
8 #include "shim.h"
9 #include "jobs.h"
10 #include "memory.h"
11 #include "../oolong/chai-notes.h"
12
13 uint64_t heap_free_address; /* Assigned in main() from alloc_gpu_heap(); copied into the MFBD and the replay payload */
14
15 #define SV_OFFSET (0x4000) /* Added to the scratchpad address for the set-value target and the MFBD block2 entries */
16
17 /* TODO: allocate these indices dynamically; message passing */
18
19 #define INDEX_VERTEX 1 /* job_index of the vertex job descriptor */
20 #define INDEX_TILER 2 /* job_index of the tiler job descriptor */
21 #define INDEX_SV 3 /* job_index of the set-value job descriptor */
22
23 #define INDEX_FRAGMENT 1 /* job_index of the fragment job descriptor; same value as INDEX_VERTEX */
24
25 int atom_count = 0; /* Pre-incremented for every atom built here; its current value is also used as a dependency atom_id */
26
27 uint64_t framebuffer; /* Assigned in main(); OR'd with BASE_EXT_RES_ACCESS_EXCLUSIVE to form the external resource entry */
28
29 uint64_t last_fragment; /* Address of the most recent fragment descriptor; read by job_chain_replay() */
30 uint64_t last_tiler; /* Address of the most recent tiler descriptor; read by job_chain_replay() */
31
32 void *set_value_helper(int fd, uint64_t out) {
33         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_set_value));
34
35         struct job_descriptor_header header = {
36                 .exception_status = JOB_NOT_STARTED,
37                 .job_descriptor_size = JOB_64_BIT,
38                 .job_type = JOB_TYPE_SET_VALUE,
39                 .job_index = INDEX_SV,
40         };
41
42         struct payload_set_value payload = {
43                 .out = out,
44                 .unknown = 0x03
45         };
46
47         memcpy(packet, &header, sizeof(header));
48         memcpy(packet + sizeof(header), &payload, sizeof(payload));
49
50         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
51         return packet;
52 }
53
54 uint64_t make_mfbd(bool tiler, uint64_t scratchpad)
55 {
56         struct tentative_mfbd *mfbd = galloc(sizeof(struct tentative_mfbd));
57         memset(mfbd, 0, sizeof(struct tentative_mfbd));
58
59         /* zeroes */
60         mfbd->block2[0] = scratchpad + SV_OFFSET;
61         mfbd->block2[1] = scratchpad + SV_OFFSET + 0x200;
62         mfbd->ugaT = scratchpad;
63         mfbd->unknown_gpu_address = 0;
64
65         /* Unknown contents -- it's a mystery! */
66         mfbd->unknown2 = 0x5ABA0200;
67         mfbd->unknown_gpu_addressN = 0;
68
69         /* Match traces. TODO decode */
70         mfbd->flags = 0xF0;
71         mfbd->heap_free_address = heap_free_address;
72         mfbd->blah = 0x1F00000000;
73         mfbd->unknown3 = tiler ? 0 : 0xFFFFF8C0;
74         mfbd->unknown1 = 0x1600;
75
76         mfbd->block1[4] = 0x02D801C2;
77         mfbd->block1[6] = 0x02D801C2;
78
79         /* This might not a tiler issue so much as a which-frame issue.
80          * First tiler is 0xFF form. Rest of C021. All fragment C021.
81          * TODO: Investigate!
82          */
83
84         mfbd->block1[7] = tiler ? 0x04001080 : 0x01001080;
85         mfbd->block1[8] = tiler ? 0x000000FF : 0xC0210000;
86         mfbd->block1[9] = tiler ? 0x3F800000 : 0x00000000;
87
88         uint64_t sab0 = 0x5ABA5ABA;
89
90         uint64_t block3[] = {
91                 0x0000000000000000,
92                 0x0000000000030005,
93                 sab0,
94                 mfbd->block2[0],
95                 0x0000000000000003,
96                 0x0000000000000000,
97                 0x0000000000000000,
98                 0x0000000000000000,
99                 sab0 + 0x300,
100         };
101
102         memcpy(mfbd->block3, block3, sizeof(block3));
103
104         return (uint64_t) (uint32_t) mfbd | MFBD | (tiler ? FBD_VERTEX_TILER : FBD_FRAGMENT);
105 }
106
107 int job_chain_fragment(int fd, uint64_t scratchpad) {
108         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_fragment));
109
110         struct job_descriptor_header header = {
111                 .exception_status = JOB_NOT_STARTED,
112                 .job_descriptor_size = JOB_32_BIT,
113                 .job_type = JOB_TYPE_FRAGMENT,
114                 .job_index = INDEX_FRAGMENT,
115         };
116
117         struct payload_fragment payload = {
118                 .min_tile_coord = MAKE_TILE_COORD(0, 0, 0),
119                 .max_tile_coord = MAKE_TILE_COORD(29, 45, 0),
120                 .fragment_fbd = make_mfbd(false, scratchpad)
121         };
122
123         memcpy(packet, &header, sizeof(header));
124         memcpy(packet + sizeof(header), &payload, sizeof(payload));
125         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
126
127         struct base_dependency depNoDep = {
128                 .atom_id = 0,
129                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
130         };
131
132         struct base_dependency depTiler = {
133                 .atom_id = atom_count /* last one */,
134                 .dependency_type = BASE_JD_DEP_TYPE_DATA
135         };
136
137         printf("Framebuffer: %LX\n", framebuffer);
138         uint64_t* resource = calloc(sizeof(u64), 1);
139         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
140
141         /* TODO: free resource */
142
143         struct base_jd_atom_v2 job = {
144                 .jc = (uint32_t) packet,
145                 .extres_list = resource,
146                 .nr_extres = 1,
147                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_FS,
148                 .atom_number = ++atom_count,
149                 .prio = BASE_JD_PRIO_MEDIUM,
150                 .device_nr = 0
151         };
152
153         job.pre_dep[0] = depTiler;
154         job.pre_dep[1] = depNoDep;
155
156         submit_job(fd, job);
157
158         last_fragment = (uint32_t) packet;
159
160         return 0;
161 }
162
163 uint64_t import_shader(int fd, uint8_t *shader, size_t sz, bool fragment)
164 {
165         int pages = 1 + (sz >> PAGE_SHIFT);
166
167         uint64_t gpu = alloc_gpu_pages(fd, pages, BASE_MEM_PROT_CPU_RD |
168                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD |
169                         BASE_MEM_PROT_GPU_EX); uint8_t *cpu =
170                 mmap_gpu(fd, gpu, pages);
171
172         memcpy(cpu, shader, sz);
173
174         /* Trap once the fragment shader is executed */
175         if(fragment) {
176                 for(int i = 0; i < 128; ++i) {
177                         cpu[i] = rand() & 0xFF;
178                 }
179         }
180
181         sync_gpu(fd, cpu, gpu, sz);
182
183         /* TODO: munmap */
184
185         return gpu | (fragment ? 9 : 5); /* Unknown flag */
186 }
187
188 uint32_t upload_vertices()
189 {
190         /* TODO: Vertices should be parametric */
191         float vertices[] = {
192                 0.0, 0.0, 0.0,
193                 1.0, 0.0, 0.0,
194                 1.0, 1.0, 0.0,
195                 0.0, 1.0, 0.0
196         };
197
198         struct vertex_buffer *vb;
199         vb = (struct vertex_buffer*) galloc(sizeof(*vb));
200         
201         vb->vertices = (uint64_t) (uint32_t) galloc(sizeof(vertices));
202         memcpy((void*) (uint32_t) vb->vertices, vertices, sizeof(vertices));
203
204         vb->vertex_size = sizeof(float) * 3;
205         vb->size = sizeof(vertices);
206
207         vb->vertices |= 1; /* TODO flags */
208         
209         return (uint32_t) vb;
210 }
211
212 void* vertex_tiler_helper(int fd, bool tiler, uint32_t fbd, uint32_t vertex_buffer, uint32_t zero_buffer)
213 {
214         void* packet = galloc(sizeof(struct job_descriptor_header) + sizeof(struct payload_vertex_tiler32));
215
216         struct job_descriptor_header header = {
217                 .exception_status = JOB_NOT_STARTED,
218                 .job_descriptor_size = JOB_32_BIT,
219                 .job_type = tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX,
220                 .job_index = tiler ? INDEX_TILER : INDEX_VERTEX,
221                 .job_dependency_index_1 = tiler ? INDEX_VERTEX : 0,
222                 .job_dependency_index_2 = tiler ? 0 : INDEX_SV
223         };
224
225         /* TODO */
226         uint32_t mode = CHAI_TRIANGLE_FAN;
227         uint32_t mode_gooks = 0x14000000 | (tiler ? (0x030000 | mode) : 0);
228         uint32_t other_gook = tiler ? 0x00000003 : 0x00000000;
229
230         struct payload_vertex_tiler32 payload = {
231                 .block1 = {
232                         0x00000003, 0x28000000, mode_gooks, 0x00000000,
233                         0x00000000, other_gook, 0x00000000, 0x00000000,
234                         0x00000005, 0x00000000, 0x00000000
235                 },
236                 .zeroes = zero_buffer,
237                 .unknown1 = (uint32_t) galloc(16),
238                 .null1 = 0,
239                 .null2 = 0,
240                 .unknown2 = (uint32_t) galloc(32),
241                 .shader = (uint32_t) galloc(sizeof(struct shader_meta)),
242                 .vertices = vertex_buffer,
243                 .unknown4 = (uint32_t) galloc(16),
244                 .unknown5 = (uint32_t) galloc(32),
245                 .unknown6 = (uint32_t) galloc(64),
246                 .nullForVertex = tiler ? (uint32_t) galloc(64) : 0,
247                 .null4 = 0,
248                 .fbd = fbd,
249                 .unknown7 = tiler ? 0 : ((uint32_t) galloc(64) | 1) /* TODO */
250         };
251
252         struct shader_meta *shader = (struct shader_meta*) payload.shader;
253
254         /* TODO: Integrate an assembler */
255 #include "../shader_hex.h"
256         shader->shader = import_shader(fd,
257                         (uint8_t*) (tiler ? fragment_shader : vertex_shader),
258                         tiler ? sizeof(fragment_shader) : sizeof(vertex_shader),
259                         tiler);
260
261         if(!tiler) {
262                 uint32_t ni[] = {
263                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000,
264                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000
265                 };
266
267                 memcpy((void*) payload.unknown2, ni, sizeof(ni));
268         }
269
270         if(tiler) {
271                 /* Lose precision... on purpose? */
272                 payload.unknown7 = (uint32_t) shader->shader;
273         }
274
275         payload.unknown7 = tiler ? 0xDEADBA00 : 0xDEADFA00;
276
277         /* TODO: Decode me! */
278
279         if(tiler) {
280                 shader->unknown1 = 0x0007000000000000;
281                 shader->unknown2 = 0x0000000000020602;
282         } else {
283                 shader->unknown1 = 0x0005000100000000;
284                 shader->unknown2 = 0x0000000000420002;
285         }
286
287         /* I have *no* idea */
288
289         uint32_t *p = (uint32_t*) payload.unknown4;
290         *p = 0x2DEA2200;
291
292         uint64_t pi[] = {
293                 0x0000000017E49000, 0x0000000017E49000, 
294                 0x0000000017E49000, 0x0000000017E49000, 
295                 0x00000000179A2200, 0x0000000017E49000, 
296                 0x0000000017E49000
297         };
298
299         memcpy((void*) payload.unknown6, pi, sizeof(pi));
300
301         if(tiler) {
302                 uint32_t ni[] = {
303                         0xFF800000, 0xFF800000,
304                         0x7F800000, 0x7F800000,
305                         0x00000000, 0x3F800000,
306                         0x00000000, 0x00EF013F,
307                         0x00000000, 0x0000001F,
308                         0x02020000, 0x00000001
309                 };
310
311                 memcpy((void*) payload.nullForVertex, ni, sizeof(ni));
312         }
313
314         /* Use some magic numbers from the traces */
315         uint64_t* unk1 = (uint64_t*) payload.unknown1;
316         /* unk1[0] = 0x000000B296271001;
317         unk1[1] = 0x000000B296273000; */
318
319         unk1[0] = 0x5a5a5a5a5a5a1001;
320         unk1[1] = 0x5a5a5a5a5a5a3000;
321
322         uint32_t writeBuffer = (uint32_t) galloc(64);
323
324         uint64_t* unk5 = (uint64_t*) payload.unknown5;
325         unk5[0] = ((uint64_t) (tiler ? 0xDB : 0x7A) << 56) | writeBuffer | 1;
326         unk5[1] = 0x0000004000000010;
327
328         if(tiler) {
329                 uint32_t ni[] = {
330                         0x00000001, 0x00000000, 0x00070000, 0x00020602,
331                         0x00000000, 0x00000000, 0x00000000, 0x3712FFFF,
332                         0x44F0FFFF, 0x0007FF00, 0x0007FF00, 0x00000000,
333                         0x00000000, 0x00000000, 0x00000000, 0x00000200,
334                         0x00000000, 0xF0122122, 0x00000000, 0x00000000,
335                         0x00000000, 0xF0122122, 0x00000000, 0xFF800000,
336                         0xFF800000, 0x7F800000, 0x7F800000, 0x00000000,
337                         0x3F800000, 0x00000000, 0xEF013F00, 0x00000000,
338                         0x0000001F, 0x02020000, 0x00000001, 0x00000000
339                 };
340
341                 memcpy(payload.block2, ni, sizeof(ni));
342         } else {
343                 uint32_t ni[] = {
344                         0x00000000, 0x0000000C, 0x00000030, 0x2DEA2200,
345                         0x00000000, 0x00000000, 0x00000000, /* Address to 1 */ 0xCAFEDA01,
346                         0x57000000, 0x00000010, 0x00000040, 0x17E49000,
347                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
348                         0x00000000, 0x17E49000, 0x00000000, 0x179A2200,
349                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
350                         0x00000000, 0x00000000, 0x00000000, 0x43200000,
351                         0x42F00000, 0x3F000000, 0x00000000, 0x43200000,
352                         0x42F00000, 0x3F000000, 0x00000000, 0x00000000
353                 };
354
355                 memcpy(payload.block2, ni, sizeof(ni));
356         }
357
358         /* Trap tiler job execution */
359
360         if(tiler) {
361                 payload.shader = 0x5AB00A05;
362
363                 /* Hit second */
364                 //payload.zeroes = 0x5AB01A00;
365
366                 payload.unknown1 = 0x5AB02A00;
367                 payload.unknown2 = 0x5AB03A00;
368                 payload.vertices = 0x5AB04A00;
369                 payload.unknown4 = 0x5AB05A00;
370                 payload.unknown5 = 0x5AB06A00;
371                 payload.unknown6 = 0x5AB07A00;
372                 payload.unknown7 = 0x5AB0DA00;
373
374                 /* Hit third */
375                 //payload.fbd    = 0x5AB09A00;
376
377                 /* Hit first */
378                 // payload.nullForVertex = 0x5AB08A00;
379         }
380
381         memcpy(packet, &header, sizeof(header));
382         memcpy(packet + sizeof(header), &payload, sizeof(payload));
383         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
384
385         return packet;
386 }
387
388 int job_chain_vertex_tiler(int fd, uint64_t scratchpad) {
389         uint32_t vertex_buffer = upload_vertices();
390         uint32_t vertex_fbd = (uint32_t) make_mfbd(true, scratchpad);
391
392         uint32_t zero_buffer = (uint32_t) alloc_gpu_pages(fd, 0x20,
393                         0x3800 | BASE_MEM_PROT_CPU_RD |
394                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD);
395
396         sync_gpu(fd, (void*) zero_buffer, zero_buffer, 0x20 << PAGE_SHIFT);
397
398         void *set = set_value_helper(fd, scratchpad + SV_OFFSET);
399         void *vertex = vertex_tiler_helper(fd, false, vertex_fbd, vertex_buffer, zero_buffer);
400         void *tiler = vertex_tiler_helper(fd, true, vertex_fbd, vertex_buffer, zero_buffer);
401
402         ((struct job_descriptor_header*) set)->next_job = (uint32_t) vertex;
403         ((struct job_descriptor_header*) vertex)->next_job = (uint32_t) tiler;
404
405         struct base_dependency depNoDep = {
406                 .atom_id = 0,
407                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
408         };
409
410         struct base_jd_atom_v2 job = {
411                 .jc = (uint32_t) set,
412                 .extres_list = NULL,
413                 .nr_extres = 0,
414                 .core_req = BASE_JD_REQ_CS | BASE_JD_REQ_T | BASE_JD_REQ_CF | BASE_JD_REQ_COHERENT_GROUP,
415                 .atom_number = ++atom_count,
416                 .prio = BASE_JD_PRIO_MEDIUM,
417                 .device_nr = 0
418         };
419
420         job.pre_dep[0] = depNoDep;
421         job.pre_dep[1] = depNoDep;
422
423         submit_job(fd, job);
424
425         last_tiler = (uint32_t) tiler;
426
427         return 0;
428 }
429
430 void job_chain_replay(int fd)
431 {
432         struct base_jd_replay_payload *payload;
433
434         payload = (struct base_jd_replay_payload*) galloc(sizeof(*payload));
435
436         payload->tiler_jc_list = last_tiler;
437         payload->fragment_jc = last_fragment;
438         payload->tiler_heap_free = heap_free_address;
439         payload->fragment_hierarchy_mask = 0;
440         payload->tiler_hierarchy_mask = 0;
441         payload->hierarchy_default_weight = 0x10000;
442         payload->tiler_core_req = BASE_JD_REQ_T | BASE_JD_REQ_COHERENT_GROUP;
443         payload->fragment_core_req = BASE_JD_REQ_FS;
444
445         struct base_dependency depNoDep = {
446                 .atom_id = 0,
447                 .dependency_type = BASE_JD_DEP_TYPE_INVALID
448         };
449
450         struct base_dependency depFragment = {
451                 .atom_id = atom_count,
452                 .dependency_type = BASE_JD_DEP_TYPE_DATA
453         };
454
455         uint64_t* resource = malloc(sizeof(u64) * 1);
456         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
457
458         struct base_jd_atom_v2 job = {
459                 .jc = (uint32_t) payload,
460                 .extres_list = resource,
461                 .nr_extres = 1,
462                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_SOFT_REPLAY,
463                 .atom_number = ++atom_count,
464                 .prio = BASE_JD_PRIO_LOW,
465                 .device_nr = 0
466         };
467
468         job.pre_dep[0] = depFragment;
469         job.pre_dep[1] = depNoDep;
470
471         submit_job(fd, job);
472 }
473
474 extern uint32_t cbma_bottom;
475 extern uint32_t cbma_top;
476
477 int main()
478 {
479         int fd = open_kernel_module();
480
481         query_gpu_props(fd);
482
483         init_cbma(fd);
484         stream_create(fd, "malitl_339_0x53ae8");
485         stream_create(fd, "malitl_339_0x53f78");
486
487         heap_free_address = alloc_gpu_heap(fd, 0x8000); 
488
489         //size_t fb_size = 29 * 16 * 45 * 16 * 4 * 2;
490
491         // framebuffer = (uint64_t) (uint32_t) galloc(fb_size);
492
493         /* Fake framebuffer to trap accesses */
494         framebuffer = 0x1CAFE0000;
495         printf("Framebuffer: %LX\n", framebuffer);
496
497         uint64_t scratchpad = alloc_gpu_pages(fd, 8, 0xC);
498         job_chain_vertex_tiler(fd, scratchpad);
499         //job_chain_fragment(fd, scratchpad);
500         //job_chain_replay(fd);
501         sync_gpu(fd, (uint8_t*) cbma_top, cbma_top, cbma_bottom - cbma_top);
502         flush_job_queue(fd);
503
504         sleep(3);
505
506         /* Dump framebuffer to a file */
507         /*uint8_t *fb = (uint8_t*) (uint32_t) framebuffer;
508         FILE *fp = fopen("framebuffer.bin", "wb");
509         fwrite(fb, 1, fb_size, fp);
510         fclose(fp);*/
511
512         /* Hang to prevent the tracer from going bananas */
513
514         while(1);
515
516         return 0;
517 }