Fix vertex upload regression
[chai.git] / re.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <memory.h>
4 #include <sys/mman.h>
5 #include <stdbool.h>
6 #include <unistd.h>
7
8 #include "shim.h"
9 #include "jobs.h"
10 #include "memory.h"
11 #include "../oolong/chai-notes.h"
12
13 #define SV_OFFSET (0x4000)
14
15 #define XYZ_COMPONENT_COUNT 3
16
17 #define INDEX_FRAGMENT 1
18
19 /* Provides sample_fragment and sample_vertex */
20 #include "../shader_hex.h"
21
22 int atom_count = 0;
23
24 struct base_dependency no_dependency = {
25         .atom_id = 0,
26         .dependency_type = BASE_JD_DEP_TYPE_INVALID
27 };
28
29 struct job_descriptor_header* set_value_helper(int fd, uint64_t out) {
30         void* packet = galloc(sizeof(struct job_descriptor_header) +
31                         sizeof(struct payload_set_value));
32
33         struct job_descriptor_header header = {
34                 .exception_status = JOB_NOT_STARTED,
35                 .job_descriptor_size = JOB_64_BIT,
36                 .job_type = JOB_TYPE_SET_VALUE
37         };
38
39         struct payload_set_value payload = {
40                 .out = out,
41                 .unknown = 0x03
42         };
43
44         memcpy(packet, &header, sizeof(header));
45         memcpy(packet + sizeof(header), &payload, sizeof(payload));
46
47         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
48         return packet;
49 }
50
51 uint64_t make_mfbd(bool tiler, uint64_t heap_free_address, uint64_t scratchpad)
52 {
53         struct tentative_mfbd *mfbd = galloc(sizeof(struct tentative_mfbd));
54
55         mfbd->block2[0] = scratchpad + SV_OFFSET;
56         mfbd->block2[1] = scratchpad + SV_OFFSET + 0x200;
57         mfbd->ugaT = scratchpad;
58         mfbd->unknown2 = heap_free_address | 0x8000000;
59         mfbd->flags = 0xF0;
60         mfbd->heap_free_address = heap_free_address;
61         mfbd->blah = 0x1F00000000;
62         mfbd->unknown1 = 0x1600;
63
64         if(!tiler)
65                 mfbd->unknown3 = 0xFFFFF8C0;
66
67         mfbd->block1[4] = 0x02D801C2;
68         mfbd->block1[6] = 0x02D801C2;
69
70         /* This might not a tiler issue so much as a which-frame issue.
71          * First tiler is 0xFF form. Rest of C021. All fragment C021.
72          * TODO: Investigate!
73          */
74
75         mfbd->block1[7] = tiler ? 0x04001080 : 0x01001080;
76         mfbd->block1[8] = tiler ? 0x000000FF : 0xC0210000;
77         mfbd->block1[9] = tiler ? 0x3F800000 : 0x00000000;
78
79         uint64_t sab0 = 0x5ABA5ABA;
80
81         uint64_t block3[] = {
82                 0x0000000000000000,
83                 0x0000000000030005,
84                 sab0,
85                 mfbd->block2[0],
86                 0x0000000000000003,
87                 0x0000000000000000,
88                 0x0000000000000000,
89                 0x0000000000000000,
90                 sab0 + 0x300,
91         };
92
93         memcpy(mfbd->block3, block3, sizeof(block3));
94
95         return (uint32_t) mfbd | MFBD | (tiler ? FBD_TILER : FBD_FRAGMENT);
96 }
97
98 uint32_t job_chain_fragment(int fd, uint64_t framebuffer,
99                 uint64_t heap_free_address, uint64_t scratchpad) {
100         void* packet = galloc(sizeof(struct job_descriptor_header)
101                         + sizeof(struct payload_fragment));
102
103         struct job_descriptor_header header = {
104                 .exception_status = JOB_NOT_STARTED,
105                 .job_descriptor_size = JOB_32_BIT,
106                 .job_type = JOB_TYPE_FRAGMENT,
107                 .job_index = INDEX_FRAGMENT,
108         };
109
110         struct payload_fragment payload = {
111                 .min_tile_coord = MAKE_TILE_COORD(0, 0, 0),
112                 .max_tile_coord = MAKE_TILE_COORD(29, 45, 0),
113                 .fragment_fbd = make_mfbd(false, heap_free_address, scratchpad)
114         };
115
116         memcpy(packet, &header, sizeof(header));
117         memcpy(packet + sizeof(header), &payload, sizeof(payload));
118         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
119
120         struct base_dependency depTiler = {
121                 .atom_id = atom_count /* last one */,
122                 .dependency_type = BASE_JD_DEP_TYPE_DATA
123         };
124
125         uint64_t* resource = calloc(sizeof(u64), 1);
126         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
127
128         /* TODO: free resource */
129
130         struct base_jd_atom_v2 job = {
131                 .jc = (uint32_t) packet,
132                 .extres_list = resource,
133                 .nr_extres = 1,
134                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_FS,
135                 .atom_number = ++atom_count,
136                 .prio = BASE_JD_PRIO_MEDIUM,
137                 .device_nr = 0,
138                 .pre_dep = { depTiler, no_dependency }
139         };
140
141         submit_job(fd, job);
142
143         return (uint32_t) packet;
144 }
145
146 uint64_t import_shader(int fd, uint8_t *shader, size_t sz, bool fragment)
147 {
148         int pages = 1 + (sz >> PAGE_SHIFT);
149
150         uint64_t gpu = alloc_gpu_pages(fd, pages, BASE_MEM_PROT_CPU_RD |
151                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD |
152                         BASE_MEM_PROT_GPU_EX);
153
154         uint8_t *cpu = mmap_gpu(fd, gpu, pages);
155
156         memcpy(cpu, shader, sz);
157         sync_gpu(fd, cpu, gpu, sz);
158
159         /* TODO: munmap */
160
161         return gpu | SHADER | (fragment ? SHADER_FRAGMENT : SHADER_VERTEX);
162 }
163
164 uint32_t upload_vertices(float *vertices, size_t sz)
165 {
166         struct vertex_buffer *vb;
167         vb = (struct vertex_buffer*) galloc(sizeof(*vb));
168         
169         float *verts = (float*) galloc(sz);
170         memcpy(verts, vertices, sz);
171         vb->vertices = (uint64_t) (uint32_t) verts;
172
173         vb->vertex_size = sizeof(float) * XYZ_COMPONENT_COUNT; 
174         vb->size = sz;
175
176         vb->vertices |= 1; /* TODO flags */
177         
178         return (uint32_t) vb;
179 }
180
181 struct job_descriptor_header* vertex_tiler_helper(int fd, bool tiler,
182                 uint32_t fbd, uint32_t vertex_buffer,
183                 uint32_t zero_buffer, uint32_t mode,
184                 void *shader, size_t shader_size)
185 {
186         void* packet = galloc(sizeof(struct job_descriptor_header)
187                         + sizeof(struct payload_vertex_tiler32));
188
189         struct job_descriptor_header header = {
190                 .exception_status = JOB_NOT_STARTED,
191                 .job_descriptor_size = JOB_32_BIT,
192                 .job_type = tiler ? JOB_TYPE_TILER : JOB_TYPE_VERTEX
193         };
194
195         /* TODO */
196         uint32_t mode_gooks = 0x14000000 | (tiler ? (0x030000 | mode) : 0);
197         uint32_t other_gook = tiler ? 0x00000003 : 0x00000000;
198
199         struct payload_vertex_tiler32 payload = {
200                 .block1 = {
201                         0x00000003, 0x28000000, mode_gooks, 0x00000000,
202                         0x00000000, other_gook, 0x00000000, 0x00000000,
203                         0x00000005, 0x00000000, 0x00000000
204                 },
205                 .zeroes = zero_buffer,
206                 .unknown1 = (uint32_t) galloc(16),
207                 .null1 = 0,
208                 .null2 = 0,
209                 .unknown2 = (uint32_t) galloc(32),
210                 .shader = (uint32_t) galloc(sizeof(struct shader_meta)),
211                 .vertices = vertex_buffer,
212                 .unknown4 = (uint32_t) galloc(16),
213                 .unknown5 = (uint32_t) galloc(32),
214                 .unknown6 = (uint32_t) galloc(64),
215                 .nullForVertex = tiler ? (uint32_t) galloc(64) : 0,
216                 .null4 = 0,
217                 .fbd = fbd,
218                 .unknown7 = tiler ? 0 : ((uint32_t) galloc(64) | 1) /* TODO */
219         };
220
221         struct shader_meta *s = (struct shader_meta*) payload.shader;
222         s->shader = import_shader(fd, shader, shader_size, tiler);
223
224         if(!tiler) {
225                 uint32_t ni[] = {
226                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000,
227                         0x43200000, 0x42F00000, 0x3F000000, 0x00000000
228                 };
229
230                 memcpy((void*) payload.unknown2, ni, sizeof(ni));
231         }
232
233         if(tiler) {
234                 /* Lose precision... on purpose? */
235                 payload.unknown7 = (uint32_t) s->shader;
236         }
237
238         payload.unknown7 = tiler ? 0xDEADBA00 : 0xDEADFA00;
239
240         /* TODO: Decode me! */
241
242         if(tiler) {
243                 s->unknown1 = 0x0007000000000000;
244                 s->unknown2 = 0x0000000000020602;
245         } else {
246                 s->unknown1 = 0x0005000100000000;
247                 s->unknown2 = 0x0000000000420002;
248         }
249
250         /* I have *no* idea */
251
252         uint32_t *p = (uint32_t*) payload.unknown4;
253         *p = 0x2DEA2200;
254
255         uint64_t pi[] = {
256                 0x0000000017E49000, 0x0000000017E49000, 
257                 0x0000000017E49000, 0x0000000017E49000, 
258                 0x00000000179A2200, 0x0000000017E49000, 
259                 0x0000000017E49000
260         };
261
262         memcpy((void*) payload.unknown6, pi, sizeof(pi));
263
264         if(tiler) {
265                 uint32_t ni[] = {
266                         0xFF800000, 0xFF800000,
267                         0x7F800000, 0x7F800000,
268                         0x00000000, 0x3F800000,
269                         0x00000000, 0x00EF013F,
270                         0x00000000, 0x0000001F,
271                         0x02020000, 0x00000001
272                 };
273
274                 memcpy((void*) payload.nullForVertex, ni, sizeof(ni));
275         }
276
277         /* Use some magic numbers from the traces */
278         uint64_t* unk1 = (uint64_t*) payload.unknown1;
279         /* unk1[0] = 0x000000B296271001;
280         unk1[1] = 0x000000B296273000; */
281
282         unk1[0] = 0x5a5a5a5a5a5a1001;
283         unk1[1] = 0x5a5a5a5a5a5a3000;
284
285         uint32_t writeBuffer = (uint32_t) galloc(64);
286
287         uint64_t* unk5 = (uint64_t*) payload.unknown5;
288         unk5[0] = ((uint64_t) (tiler ? 0xDB : 0x7A) << 56) | writeBuffer | 1;
289         unk5[1] = 0x0000004000000010;
290
291         if(tiler) {
292                 uint32_t ni[] = {
293                         0x00000001, 0x00000000, 0x00070000, 0x00020602,
294                         0x00000000, 0x00000000, 0x00000000, 0x3712FFFF,
295                         0x44F0FFFF, 0x0007FF00, 0x0007FF00, 0x00000000,
296                         0x00000000, 0x00000000, 0x00000000, 0x00000200,
297                         0x00000000, 0xF0122122, 0x00000000, 0x00000000,
298                         0x00000000, 0xF0122122, 0x00000000, 0xFF800000,
299                         0xFF800000, 0x7F800000, 0x7F800000, 0x00000000,
300                         0x3F800000, 0x00000000, 0xEF013F00, 0x00000000,
301                         0x0000001F, 0x02020000, 0x00000001, 0x00000000
302                 };
303
304                 memcpy(payload.block2, ni, sizeof(ni));
305         } else {
306                 uint32_t ni[] = {
307                         0x00000000, 0x0000000C, 0x00000030, 0x2DEA2200,
308                         0x00000000, 0x00000000, 0x00000000, /* Address to 1 */ 0xCAFEDA01,
309                         0x57000000, 0x00000010, 0x00000040, 0x17E49000,
310                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
311                         0x00000000, 0x17E49000, 0x00000000, 0x179A2200,
312                         0x00000000, 0x17E49000, 0x00000000, 0x17E49000,
313                         0x00000000, 0x00000000, 0x00000000, 0x43200000,
314                         0x42F00000, 0x3F000000, 0x00000000, 0x43200000,
315                         0x42F00000, 0x3F000000, 0x00000000, 0x00000000
316                 };
317
318                 memcpy(payload.block2, ni, sizeof(ni));
319         }
320
321         /* Trap tiler job execution */
322
323         if(tiler) {
324                 payload.shader = 0x5AB00A05;
325
326                 /* Hit second */
327                 //payload.zeroes = 0x5AB01A00;
328
329                 payload.unknown1 = 0x5AB02A00;
330                 payload.unknown2 = 0x5AB03A00;
331                 payload.vertices = 0x5AB04A00;
332                 payload.unknown4 = 0x5AB05A00;
333                 payload.unknown5 = 0x5AB06A00;
334                 payload.unknown6 = 0x5AB07A00;
335                 payload.unknown7 = 0x5AB0DA00;
336
337                 /* Hit third */
338                 //payload.fbd    = 0x5AB09A00;
339
340                 /* Hit first */
341                 // payload.nullForVertex = 0x5AB08A00;
342         }
343
344         memcpy(packet, &header, sizeof(header));
345         memcpy(packet + sizeof(header), &payload, sizeof(payload));
346         sync_gpu(fd, packet, (uint32_t) packet, sizeof(header) + sizeof(payload));
347
348         return packet;
349 }
350
351 uint32_t job_chain_vertex_tiler(int fd,
352                 float *vertices, size_t vertex_size, int mode,
353                 void* vertex_shader, size_t vs_sz,
354                 void *fragment_shader, size_t fs_sz,
355                 uint64_t heap_free_address, uint64_t scratchpad)
356 {
357         uint32_t vertex_buffer = upload_vertices(vertices, vertex_size);
358         uint32_t vertex_fbd = (uint32_t) make_mfbd(true, heap_free_address, scratchpad);
359
360         uint32_t zero_buffer = (uint32_t) alloc_gpu_pages(fd, 0x20,
361                         0x3800 | BASE_MEM_PROT_CPU_RD |
362                         BASE_MEM_PROT_CPU_WR | BASE_MEM_PROT_GPU_RD);
363
364         sync_gpu(fd, (void*) zero_buffer, zero_buffer, 0x20 << PAGE_SHIFT);
365
366         struct job_descriptor_header *set = set_value_helper(fd, scratchpad + SV_OFFSET);
367
368         struct job_descriptor_header *vertex =
369                 vertex_tiler_helper(fd, false,
370                                 vertex_fbd, vertex_buffer,
371                                 zero_buffer, mode,
372                                 vertex_shader, vs_sz);
373
374         struct job_descriptor_header *tiler =
375                 vertex_tiler_helper(fd, true,
376                                 vertex_fbd, vertex_buffer,
377                                 zero_buffer, mode,
378                                 fragment_shader, fs_sz);
379
380         set->next_job = (uint32_t) vertex;
381         vertex->next_job = (uint32_t) tiler;
382
383         /* TODO: Determine if these numbers are meaningful */
384         set->job_index = 3;
385         vertex->job_index = 1;
386         tiler->job_index = 2;
387
388         vertex->job_dependency_index_2 = set->job_index;
389         tiler->job_dependency_index_1 = vertex->job_index;
390
391         struct base_jd_atom_v2 job = {
392                 .jc = (uint32_t) set,
393                 .extres_list = NULL,
394                 .nr_extres = 0,
395                 .core_req = BASE_JD_REQ_CS | BASE_JD_REQ_T
396                         | BASE_JD_REQ_CF | BASE_JD_REQ_COHERENT_GROUP,
397                 .atom_number = ++atom_count,
398                 .prio = BASE_JD_PRIO_MEDIUM,
399                 .device_nr = 0,
400                 .pre_dep = { no_dependency, no_dependency }
401         };
402
403         submit_job(fd, job);
404
405         return (uint32_t) tiler;
406 }
407
408 void job_chain_replay(int fd, uint32_t tiler_jc, uint32_t fragment_jc,
409                 uint64_t heap_free_address, uint64_t framebuffer)
410 {
411         struct base_jd_replay_payload *payload;
412
413         payload = (struct base_jd_replay_payload*) galloc(sizeof(*payload));
414
415         payload->tiler_jc_list = tiler_jc;
416         payload->fragment_jc = fragment_jc;
417         payload->tiler_heap_free = heap_free_address;
418         payload->fragment_hierarchy_mask = 0;
419         payload->tiler_hierarchy_mask = 0;
420         payload->hierarchy_default_weight = 0x10000;
421         payload->tiler_core_req = BASE_JD_REQ_T | BASE_JD_REQ_COHERENT_GROUP;
422         payload->fragment_core_req = BASE_JD_REQ_FS;
423
424         struct base_dependency depFragment = {
425                 .atom_id = atom_count,
426                 .dependency_type = BASE_JD_DEP_TYPE_DATA
427         };
428
429         uint64_t* resource = malloc(sizeof(u64) * 1);
430         resource[0] = framebuffer | BASE_EXT_RES_ACCESS_EXCLUSIVE;
431
432         struct base_jd_atom_v2 job = {
433                 .jc = (uint32_t) payload,
434                 .extres_list = resource,
435                 .nr_extres = 1,
436                 .core_req = BASE_JD_REQ_EXTERNAL_RESOURCES | BASE_JD_REQ_SOFT_REPLAY,
437                 .atom_number = ++atom_count,
438                 .prio = BASE_JD_PRIO_LOW,
439                 .device_nr = 0,
440                 .pre_dep = { depFragment, no_dependency }
441         };
442
443         submit_job(fd, job);
444 }
445
446 int main()
447 {
448         int fd = open_kernel_module();
449
450         query_gpu_props(fd);
451
452         init_cbma(fd);
453         stream_create(fd, "malitl_339_0x53ae8");
454         stream_create(fd, "malitl_339_0x53f78");
455
456         uint64_t scratchpad = alloc_gpu_pages(fd, 8, 0xC);
457         uint64_t heap_free_address = alloc_gpu_heap(fd, 0x8000);        
458
459         // size_t fb_size = 29 * 16 * 45 * 16 * 4 * 2;
460         // uint64_t framebuffer = (uint64_t) (uint32_t) galloc(fb_size);
461         
462         /* Fake framebuffer to trap accesses */
463         uint64_t framebuffer = 0x1CAFE0000;
464
465         float vertices[] = {
466                 0.0, 0.0, 0.0,
467                 1.0, 0.0, 0.0,
468                 1.0, 1.0, 0.0,
469                 0.0, 1.0, 0.0
470         };
471
472         uint32_t tiler_jc =
473                 job_chain_vertex_tiler(fd,
474                         vertices, sizeof(vertices), CHAI_TRIANGLE_FAN,
475                         sample_vertex, sizeof(sample_vertex),
476                         sample_fragment, sizeof(sample_fragment),
477                         heap_free_address, scratchpad);
478
479         uint32_t fragment_jc = job_chain_fragment(fd, framebuffer, heap_free_address, scratchpad);
480
481         job_chain_replay(fd, tiler_jc, fragment_jc, heap_free_address, framebuffer);
482         sync_cbma(fd);
483         flush_job_queue(fd);
484
485         sleep(3);
486
487         /* Dump framebuffer to a file */
488         /*uint8_t *fb = (uint8_t*) (uint32_t) framebuffer;
489         FILE *fp = fopen("framebuffer.bin", "wb");
490         fwrite(fb, 1, fb_size, fp);
491         fclose(fp);*/
492
493         /* Hang to prevent the tracer from going bananas */
494
495         while(1);
496
497         return 0;
498 }