// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
/* Coefficients inside this range around zero are quantized to zero. */
#define DEADZONE_WIDTH 20

/*
 * Offsets into an 8x8 block, listed in zigzag (anti-diagonal) scan
 * order.  rlc()/derlc() serialize coefficients through this table so
 * that the high-sequency coefficients — which are usually zero — end
 * up as one long trailing run.
 */
static const uint8_t zigzag[64] = {
	0,
	1, 8,
	2, 9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
37 static int rlc(const s16 *in, __be16 *output, int blocktype)
45 /* read in block from framebuffer */
49 for (y = 0; y < 8; y++) {
50 for (x = 0; x < 8; x++) {
56 /* keep track of amount of trailing zeros */
57 for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
60 *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
63 to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
66 while (i < to_encode) {
70 /* count leading zeros */
71 while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
79 /* 4 bits for run, 12 for coefficient (quantization by 4) */
80 *output++ = htons((cnt | tmp << 4));
84 if (lastzero_run > 14) {
85 *output = htons(ALL_ZEROS | 0);
93 * This function will worst-case increase rlc_in by 65*2 bytes:
94 * one s16 value for the header and 8 * 8 coefficients of type s16.
96 static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
99 const __be16 *input = *rlc_in;
100 s16 ret = ntohs(*input++);
102 s16 block[8 * 8 + 16];
107 * Now de-compress, it expands one byte to up to 15 bytes
108 * (or fills the remainder of the 64 bytes with zeroes if it
109 * is the last byte to expand).
111 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
112 * allow for overflow if the incoming data was malformed.
114 while (dec_count < 8 * 8) {
115 s16 in = ntohs(*input++);
116 int length = in & 0xf;
119 /* fill remainder with zeros */
121 for (i = 0; i < 64 - dec_count; i++)
126 for (i = 0; i < length; i++)
129 dec_count += length + 1;
134 for (i = 0; i < 64; i++) {
139 dwht_out[x + y * 8] = *wp++;
/*
 * Per-coefficient quantization shift amounts for intra (I) blocks.
 * Higher-sequency coefficients (bottom-right) are quantized more
 * aggressively.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};
/*
 * Per-coefficient quantization shift amounts for predicted (P) blocks.
 * Coarser than quant_table since P-block deltas carry twice the range.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};
167 static void quantize_intra(s16 *coeff, s16 *de_coeff)
169 const int *quant = quant_table;
172 for (j = 0; j < 8; j++) {
173 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
175 if (*coeff >= -DEADZONE_WIDTH &&
176 *coeff <= DEADZONE_WIDTH)
177 *coeff = *de_coeff = 0;
179 *de_coeff = *coeff << *quant;
184 static void dequantize_intra(s16 *coeff)
186 const int *quant = quant_table;
189 for (j = 0; j < 8; j++)
190 for (i = 0; i < 8; i++, quant++, coeff++)
194 static void quantize_inter(s16 *coeff, s16 *de_coeff)
196 const int *quant = quant_table_p;
199 for (j = 0; j < 8; j++) {
200 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
202 if (*coeff >= -DEADZONE_WIDTH &&
203 *coeff <= DEADZONE_WIDTH)
204 *coeff = *de_coeff = 0;
206 *de_coeff = *coeff << *quant;
211 static void dequantize_inter(s16 *coeff)
213 const int *quant = quant_table_p;
216 for (j = 0; j < 8; j++)
217 for (i = 0; i < 8; i++, quant++, coeff++)
221 static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
222 unsigned int input_step, bool intra)
224 /* we'll need more than 8 bits for the transformed coefficients */
225 s32 workspace1[8], workspace2[8];
226 const u8 *tmp = block;
227 s16 *out = output_block;
228 int add = intra ? 256 : 0;
232 stride *= input_step;
234 for (i = 0; i < 8; i++, tmp += stride, out += 8) {
235 if (input_step == 1) {
236 workspace1[0] = tmp[0] + tmp[1] - add;
237 workspace1[1] = tmp[0] - tmp[1];
239 workspace1[2] = tmp[2] + tmp[3] - add;
240 workspace1[3] = tmp[2] - tmp[3];
242 workspace1[4] = tmp[4] + tmp[5] - add;
243 workspace1[5] = tmp[4] - tmp[5];
245 workspace1[6] = tmp[6] + tmp[7] - add;
246 workspace1[7] = tmp[6] - tmp[7];
248 workspace1[0] = tmp[0] + tmp[2] - add;
249 workspace1[1] = tmp[0] - tmp[2];
251 workspace1[2] = tmp[4] + tmp[6] - add;
252 workspace1[3] = tmp[4] - tmp[6];
254 workspace1[4] = tmp[8] + tmp[10] - add;
255 workspace1[5] = tmp[8] - tmp[10];
257 workspace1[6] = tmp[12] + tmp[14] - add;
258 workspace1[7] = tmp[12] - tmp[14];
262 workspace2[0] = workspace1[0] + workspace1[2];
263 workspace2[1] = workspace1[0] - workspace1[2];
264 workspace2[2] = workspace1[1] - workspace1[3];
265 workspace2[3] = workspace1[1] + workspace1[3];
267 workspace2[4] = workspace1[4] + workspace1[6];
268 workspace2[5] = workspace1[4] - workspace1[6];
269 workspace2[6] = workspace1[5] - workspace1[7];
270 workspace2[7] = workspace1[5] + workspace1[7];
273 out[0] = workspace2[0] + workspace2[4];
274 out[1] = workspace2[0] - workspace2[4];
275 out[2] = workspace2[1] - workspace2[5];
276 out[3] = workspace2[1] + workspace2[5];
277 out[4] = workspace2[2] + workspace2[6];
278 out[5] = workspace2[2] - workspace2[6];
279 out[6] = workspace2[3] - workspace2[7];
280 out[7] = workspace2[3] + workspace2[7];
285 for (i = 0; i < 8; i++, out++) {
287 workspace1[0] = out[0] + out[1 * 8];
288 workspace1[1] = out[0] - out[1 * 8];
290 workspace1[2] = out[2 * 8] + out[3 * 8];
291 workspace1[3] = out[2 * 8] - out[3 * 8];
293 workspace1[4] = out[4 * 8] + out[5 * 8];
294 workspace1[5] = out[4 * 8] - out[5 * 8];
296 workspace1[6] = out[6 * 8] + out[7 * 8];
297 workspace1[7] = out[6 * 8] - out[7 * 8];
300 workspace2[0] = workspace1[0] + workspace1[2];
301 workspace2[1] = workspace1[0] - workspace1[2];
302 workspace2[2] = workspace1[1] - workspace1[3];
303 workspace2[3] = workspace1[1] + workspace1[3];
305 workspace2[4] = workspace1[4] + workspace1[6];
306 workspace2[5] = workspace1[4] - workspace1[6];
307 workspace2[6] = workspace1[5] - workspace1[7];
308 workspace2[7] = workspace1[5] + workspace1[7];
310 out[0 * 8] = workspace2[0] + workspace2[4];
311 out[1 * 8] = workspace2[0] - workspace2[4];
312 out[2 * 8] = workspace2[1] - workspace2[5];
313 out[3 * 8] = workspace2[1] + workspace2[5];
314 out[4 * 8] = workspace2[2] + workspace2[6];
315 out[5 * 8] = workspace2[2] - workspace2[6];
316 out[6 * 8] = workspace2[3] - workspace2[7];
317 out[7 * 8] = workspace2[3] + workspace2[7];
322 * Not the nicest way of doing it, but P-blocks get twice the range of
323 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
324 * Furthermore values can be negative... This is just a version that
325 * works with 16 signed data
327 static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
329 /* we'll need more than 8 bits for the transformed coefficients */
330 s32 workspace1[8], workspace2[8];
331 const s16 *tmp = block;
332 s16 *out = output_block;
335 for (i = 0; i < 8; i++, tmp += stride, out += 8) {
337 workspace1[0] = tmp[0] + tmp[1];
338 workspace1[1] = tmp[0] - tmp[1];
340 workspace1[2] = tmp[2] + tmp[3];
341 workspace1[3] = tmp[2] - tmp[3];
343 workspace1[4] = tmp[4] + tmp[5];
344 workspace1[5] = tmp[4] - tmp[5];
346 workspace1[6] = tmp[6] + tmp[7];
347 workspace1[7] = tmp[6] - tmp[7];
350 workspace2[0] = workspace1[0] + workspace1[2];
351 workspace2[1] = workspace1[0] - workspace1[2];
352 workspace2[2] = workspace1[1] - workspace1[3];
353 workspace2[3] = workspace1[1] + workspace1[3];
355 workspace2[4] = workspace1[4] + workspace1[6];
356 workspace2[5] = workspace1[4] - workspace1[6];
357 workspace2[6] = workspace1[5] - workspace1[7];
358 workspace2[7] = workspace1[5] + workspace1[7];
361 out[0] = workspace2[0] + workspace2[4];
362 out[1] = workspace2[0] - workspace2[4];
363 out[2] = workspace2[1] - workspace2[5];
364 out[3] = workspace2[1] + workspace2[5];
365 out[4] = workspace2[2] + workspace2[6];
366 out[5] = workspace2[2] - workspace2[6];
367 out[6] = workspace2[3] - workspace2[7];
368 out[7] = workspace2[3] + workspace2[7];
373 for (i = 0; i < 8; i++, out++) {
375 workspace1[0] = out[0] + out[1*8];
376 workspace1[1] = out[0] - out[1*8];
378 workspace1[2] = out[2*8] + out[3*8];
379 workspace1[3] = out[2*8] - out[3*8];
381 workspace1[4] = out[4*8] + out[5*8];
382 workspace1[5] = out[4*8] - out[5*8];
384 workspace1[6] = out[6*8] + out[7*8];
385 workspace1[7] = out[6*8] - out[7*8];
388 workspace2[0] = workspace1[0] + workspace1[2];
389 workspace2[1] = workspace1[0] - workspace1[2];
390 workspace2[2] = workspace1[1] - workspace1[3];
391 workspace2[3] = workspace1[1] + workspace1[3];
393 workspace2[4] = workspace1[4] + workspace1[6];
394 workspace2[5] = workspace1[4] - workspace1[6];
395 workspace2[6] = workspace1[5] - workspace1[7];
396 workspace2[7] = workspace1[5] + workspace1[7];
399 out[0*8] = workspace2[0] + workspace2[4];
400 out[1*8] = workspace2[0] - workspace2[4];
401 out[2*8] = workspace2[1] - workspace2[5];
402 out[3*8] = workspace2[1] + workspace2[5];
403 out[4*8] = workspace2[2] + workspace2[6];
404 out[5*8] = workspace2[2] - workspace2[6];
405 out[6*8] = workspace2[3] - workspace2[7];
406 out[7*8] = workspace2[3] + workspace2[7];
410 static void ifwht(const s16 *block, s16 *output_block, int intra)
413 * we'll need more than 8 bits for the transformed coefficients
414 * use native unit of cpu
416 int workspace1[8], workspace2[8];
417 int inter = intra ? 0 : 1;
418 const s16 *tmp = block;
419 s16 *out = output_block;
422 for (i = 0; i < 8; i++, tmp += 8, out += 8) {
424 workspace1[0] = tmp[0] + tmp[1];
425 workspace1[1] = tmp[0] - tmp[1];
427 workspace1[2] = tmp[2] + tmp[3];
428 workspace1[3] = tmp[2] - tmp[3];
430 workspace1[4] = tmp[4] + tmp[5];
431 workspace1[5] = tmp[4] - tmp[5];
433 workspace1[6] = tmp[6] + tmp[7];
434 workspace1[7] = tmp[6] - tmp[7];
437 workspace2[0] = workspace1[0] + workspace1[2];
438 workspace2[1] = workspace1[0] - workspace1[2];
439 workspace2[2] = workspace1[1] - workspace1[3];
440 workspace2[3] = workspace1[1] + workspace1[3];
442 workspace2[4] = workspace1[4] + workspace1[6];
443 workspace2[5] = workspace1[4] - workspace1[6];
444 workspace2[6] = workspace1[5] - workspace1[7];
445 workspace2[7] = workspace1[5] + workspace1[7];
448 out[0] = workspace2[0] + workspace2[4];
449 out[1] = workspace2[0] - workspace2[4];
450 out[2] = workspace2[1] - workspace2[5];
451 out[3] = workspace2[1] + workspace2[5];
452 out[4] = workspace2[2] + workspace2[6];
453 out[5] = workspace2[2] - workspace2[6];
454 out[6] = workspace2[3] - workspace2[7];
455 out[7] = workspace2[3] + workspace2[7];
460 for (i = 0; i < 8; i++, out++) {
462 workspace1[0] = out[0] + out[1 * 8];
463 workspace1[1] = out[0] - out[1 * 8];
465 workspace1[2] = out[2 * 8] + out[3 * 8];
466 workspace1[3] = out[2 * 8] - out[3 * 8];
468 workspace1[4] = out[4 * 8] + out[5 * 8];
469 workspace1[5] = out[4 * 8] - out[5 * 8];
471 workspace1[6] = out[6 * 8] + out[7 * 8];
472 workspace1[7] = out[6 * 8] - out[7 * 8];
475 workspace2[0] = workspace1[0] + workspace1[2];
476 workspace2[1] = workspace1[0] - workspace1[2];
477 workspace2[2] = workspace1[1] - workspace1[3];
478 workspace2[3] = workspace1[1] + workspace1[3];
480 workspace2[4] = workspace1[4] + workspace1[6];
481 workspace2[5] = workspace1[4] - workspace1[6];
482 workspace2[6] = workspace1[5] - workspace1[7];
483 workspace2[7] = workspace1[5] + workspace1[7];
489 out[0 * 8] = workspace2[0] + workspace2[4];
490 out[1 * 8] = workspace2[0] - workspace2[4];
491 out[2 * 8] = workspace2[1] - workspace2[5];
492 out[3 * 8] = workspace2[1] + workspace2[5];
493 out[4 * 8] = workspace2[2] + workspace2[6];
494 out[5 * 8] = workspace2[2] - workspace2[6];
495 out[6 * 8] = workspace2[3] - workspace2[7];
496 out[7 * 8] = workspace2[3] + workspace2[7];
498 for (d = 0; d < 8; d++)
503 out[0 * 8] = workspace2[0] + workspace2[4];
504 out[1 * 8] = workspace2[0] - workspace2[4];
505 out[2 * 8] = workspace2[1] - workspace2[5];
506 out[3 * 8] = workspace2[1] + workspace2[5];
507 out[4 * 8] = workspace2[2] + workspace2[6];
508 out[5 * 8] = workspace2[2] - workspace2[6];
509 out[6 * 8] = workspace2[3] - workspace2[7];
510 out[7 * 8] = workspace2[3] + workspace2[7];
512 for (d = 0; d < 8; d++) {
520 static void fill_encoder_block(const u8 *input, s16 *dst,
521 unsigned int stride, unsigned int input_step)
525 for (i = 0; i < 8; i++) {
526 for (j = 0; j < 8; j++, input += input_step)
528 input += (stride - 8) * input_step;
532 static int var_intra(const s16 *input)
536 const s16 *tmp = input;
539 for (i = 0; i < 8 * 8; i++, tmp++)
543 for (i = 0; i < 8 * 8; i++, tmp++)
544 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
548 static int var_inter(const s16 *old, const s16 *new)
553 for (i = 0; i < 8 * 8; i++, old++, new++)
554 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
558 static int decide_blocktype(const u8 *cur, const u8 *reference,
559 s16 *deltablock, unsigned int stride,
560 unsigned int input_step)
569 fill_encoder_block(cur, tmp, stride, input_step);
570 fill_encoder_block(reference, old, 8, 1);
571 vari = var_intra(tmp);
573 for (k = 0; k < 8; k++) {
574 for (l = 0; l < 8; l++) {
575 *deltablock = *work - *reference;
582 vard = var_inter(old, tmp);
583 return vari <= vard ? IBLOCK : PBLOCK;
586 static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
590 for (i = 0; i < 8; i++) {
591 for (j = 0; j < 8; j++, input++, dst++) {
594 else if (*input > 255)
603 static void add_deltas(s16 *deltas, const u8 *ref, int stride)
607 for (k = 0; k < 8; k++) {
608 for (l = 0; l < 8; l++) {
611 * Due to quantizing, it might possible that the
612 * decoded coefficients are slightly out of range
616 else if (*deltas > 255)
624 static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
625 struct cframe *cf, u32 height, u32 width,
626 unsigned int input_step,
627 bool is_intra, bool next_is_intra)
629 u8 *input_start = input;
630 __be16 *rlco_start = *rlco;
632 __be16 pframe_bit = htons(PFRAME_BIT);
634 unsigned int last_size = 0;
637 for (j = 0; j < height / 8; j++) {
638 for (i = 0; i < width / 8; i++) {
639 /* intra code, first frame is always intra coded. */
640 int blocktype = IBLOCK;
644 blocktype = decide_blocktype(input, refp,
645 deltablock, width, input_step);
646 if (is_intra || blocktype == IBLOCK) {
647 fwht(input, cf->coeffs, width, input_step, 1);
648 quantize_intra(cf->coeffs, cf->de_coeffs);
652 encoding |= FRAME_PCODED;
653 fwht16(deltablock, cf->coeffs, 8, 0);
654 quantize_inter(cf->coeffs, cf->de_coeffs);
656 if (!next_is_intra) {
657 ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
659 if (blocktype == PBLOCK)
660 add_deltas(cf->de_fwht, refp, 8);
661 fill_decoder_block(refp, cf->de_fwht, 8);
664 input += 8 * input_step;
667 if (encoding & FRAME_UNENCODED)
670 size = rlc(cf->coeffs, *rlco, blocktype);
671 if (last_size == size &&
672 !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
673 __be16 *last_rlco = *rlco - size;
674 s16 hdr = ntohs(*last_rlco);
676 if (!((*last_rlco ^ **rlco) & pframe_bit) &&
677 (hdr & DUPS_MASK) < DUPS_MASK)
678 *last_rlco = htons(hdr + 2);
684 if (*rlco >= rlco_max)
685 encoding |= FRAME_UNENCODED;
688 input += width * 7 * input_step;
690 if (encoding & FRAME_UNENCODED) {
691 u8 *out = (u8 *)rlco_start;
695 * The compressed stream should never contain the magic
696 * header, so when we copy the YUV data we replace 0xff
697 * by 0xfe. Since YUV is limited range such values
698 * shouldn't appear anyway.
700 for (i = 0; i < height * width; i++, input += input_step)
701 *out++ = (*input == 0xff) ? 0xfe : *input;
702 *rlco = (__be16 *)out;
707 u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
708 struct cframe *cf, bool is_intra, bool next_is_intra)
710 unsigned int size = frm->height * frm->width;
711 __be16 *rlco = cf->rlc_data;
715 rlco_max = rlco + size / 2 - 256;
716 encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
717 frm->height, frm->width,
718 1, is_intra, next_is_intra);
719 if (encoding & FRAME_UNENCODED)
720 encoding |= LUMA_UNENCODED;
721 encoding &= ~FRAME_UNENCODED;
722 rlco_max = rlco + size / 8 - 256;
723 encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
724 frm->height / 2, frm->width / 2,
725 frm->chroma_step, is_intra, next_is_intra);
726 if (encoding & FRAME_UNENCODED)
727 encoding |= CB_UNENCODED;
728 encoding &= ~FRAME_UNENCODED;
729 rlco_max = rlco + size / 8 - 256;
730 encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
731 frm->height / 2, frm->width / 2,
732 frm->chroma_step, is_intra, next_is_intra);
733 if (encoding & FRAME_UNENCODED)
734 encoding |= CR_UNENCODED;
735 encoding &= ~FRAME_UNENCODED;
736 cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
740 static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
741 u32 height, u32 width, bool uncompressed)
743 unsigned int copies = 0;
749 memcpy(ref, *rlco, width * height);
750 *rlco += width * height / 2;
755 * When decoding each macroblock the rlco pointer will be increased
756 * by 65 * 2 bytes worst-case.
757 * To avoid overflow the buffer has to be 65/64th of the actual raw
758 * image size, just in case someone feeds it malicious data.
760 for (j = 0; j < height / 8; j++) {
761 for (i = 0; i < width / 8; i++) {
762 u8 *refp = ref + j * 8 * width + i * 8;
765 memcpy(cf->de_fwht, copy, sizeof(copy));
766 if (stat & PFRAME_BIT)
767 add_deltas(cf->de_fwht, refp, width);
768 fill_decoder_block(refp, cf->de_fwht, width);
773 stat = derlc(rlco, cf->coeffs);
775 if (stat & PFRAME_BIT)
776 dequantize_inter(cf->coeffs);
778 dequantize_intra(cf->coeffs);
780 ifwht(cf->coeffs, cf->de_fwht,
781 (stat & PFRAME_BIT) ? 0 : 1);
783 copies = (stat & DUPS_MASK) >> 1;
785 memcpy(copy, cf->de_fwht, sizeof(copy));
786 if (stat & PFRAME_BIT)
787 add_deltas(cf->de_fwht, refp, width);
788 fill_decoder_block(refp, cf->de_fwht, width);
793 void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
795 const __be16 *rlco = cf->rlc_data;
797 decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
798 hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
799 decode_plane(cf, &rlco, ref->cb, cf->height / 2, cf->width / 2,
800 hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
801 decode_plane(cf, &rlco, ref->cr, cf->height / 2, cf->width / 2,
802 hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);