diff --git a/decoder/LAVVideo/decoders/avcodec.cpp b/decoder/LAVVideo/decoders/avcodec.cpp index 369640d..7e1b4c7 100644 --- a/decoder/LAVVideo/decoders/avcodec.cpp +++ b/decoder/LAVVideo/decoders/avcodec.cpp @@ -690,7 +690,7 @@ STDMETHODIMP CDecAvcodec::Decode(const BYTE *buffer, int buflen, REFERENCE_TIME // MPEG-2 and VC-1 just wait for a keyframe.. if (m_nCodecId == CODEC_ID_H264 && (bParserFrame || !m_pParser || got_picture)) { m_h264RandomAccess.judgeFrameUsability(m_pFrame, &got_picture); - } else if (m_nCodecId == CODEC_ID_MPEG2VIDEO || m_nCodecId == CODEC_ID_VC1) { + } else if (m_nCodecId == CODEC_ID_MPEG2VIDEO || m_nCodecId == CODEC_ID_VC1 || m_nCodecId == CODEC_ID_RV30 || m_nCodecId == CODEC_ID_RV40) { if (m_bWaitingForKeyFrame && got_picture) { if (m_pFrame->key_frame) { m_bWaitingForKeyFrame = FALSE; diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index e02546e..63d6b12 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -29,15 +29,26 @@ reg = _mm_srli_epi16(reg, 8-bits); /* shift to the required dithering strength */ // Load 8 16-bit pixels into a register, and dither them to 8 bit -// The 8-bit pixels will be in the low-bytes of the 8 16-bit parts +// The 8-bit pixels will be in the high-bytes of the 8 16-bit parts +// NOTE: the low-bytes are clobbered, and not empty. // reg - register to store pixels in // dreg - register with dithering coefficients // src - memory pointer of the source // shift - shift offset to 8-bit (ie. 
2 for 10bit) -#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift) \ +#define PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift) \ reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \ - reg = _mm_adds_epu16(reg, dreg); /* dither */ \ - reg = _mm_srli_epi16(reg, shift); /* shift to 8-bit */ + reg = _mm_slli_epi16(reg, 8-shift); /* shift to 16-bit */ \ + reg = _mm_adds_epu16(reg, dreg); /* dither */ + +// Load 8 16-bit pixels into a register, and dither them to 8 bit +// The 8-bit pixels will be in the low-bytes of the 8 16-bit parts +// reg - register to store pixels in +// dreg - register with dithering coefficients +// src - memory pointer of the source +// shift - shift offset to 8-bit (ie. 2 for 10bit) +#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift) \ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift) \ + reg = _mm_srli_epi16(reg, 8); /* shift to 8-bit */ // Load 8 16-bit pixels into a register, and dither them to 8 bit // The 8-bit pixels will be in the 8 low-bytes in the register diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index 40425e3..524fa1b 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -40,16 +40,18 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_yv12_nv12_dither_le) int line, i; - __m128i xmm0,xmm1,xmm2,xmm3,xmm4; + __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5; uint8_t *dstY = dst; uint8_t *dstV = dst + outYStride * height; uint8_t *dstU = dstV + outUVStride * (height >> 1); + xmm5 = _mm_set1_epi32(0xff00ff00); + // Process Y for (line = 0; line < height; ++line) { // Load dithering coefficients for this line - PIXCONV_LOAD_DITHER_COEFFS(xmm4,line,shift,dithers); + PIXCONV_LOAD_DITHER_COEFFS(xmm4,line,8,dithers); __m128i *dst128Y = (__m128i *)(dst + line * outYStride); @@ -70,7 +72,7 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_yv12_nv12_dither_le) for (line = 0; line < (height >> 1); ++line) { // Load 
dithering coefficients for this line - PIXCONV_LOAD_DITHER_COEFFS(xmm4,line,shift,dithers); + PIXCONV_LOAD_DITHER_COEFFS(xmm4,line,8,dithers); __m128i *dst128UV = (__m128i *)(dstV + line * outYStride); __m128i *dst128U = (__m128i *)(dstU + line * outUVStride); @@ -79,12 +81,12 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_yv12_nv12_dither_le) for (i = 0; i < chromaWidth; i+=16) { PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i), shift); /* U0U0U0U0 */ PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm4, (u+i+8), shift); /* U0U0U0U0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm4, (v+i), shift); /* V0V0V0V0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm4, (v+i+8), shift); /* V0V0V0V0 */ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm2, xmm4, (v+i), shift); /* 0V0V0V0V */ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm3, xmm4, (v+i+8), shift); /* 0V0V0V0V */ if (nv12) { - xmm2 = _mm_slli_epi16(xmm2, 8); /* 0V0V0V0V */ - xmm3 = _mm_slli_epi16(xmm3, 8); /* 0V0V0V0V */ + xmm2 = _mm_and_si128(xmm2, xmm5); + xmm3 = _mm_and_si128(xmm3, xmm5); xmm0 = _mm_or_si128(xmm0, xmm2); /* UVUVUVUV */ xmm1 = _mm_or_si128(xmm1, xmm3); /* UVUVUVUV */ _mm_stream_si128(dst128UV++, xmm0); @@ -93,6 +95,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_yv12_nv12_dither_le) xmm0 = _mm_packus_epi16(xmm0, xmm1); /* UUUUUUUU */ _mm_stream_si128(dst128U++, xmm0); + xmm2 = _mm_srli_epi16(xmm2, 8); + xmm3 = _mm_srli_epi16(xmm3, 8); xmm2 = _mm_packus_epi16(xmm2, xmm3); /* VVVVVVVV */ _mm_stream_si128(dst128V++, xmm2); } diff --git a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp index 1eb15a4..8aa143b 100644 --- a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp +++ b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp @@ -107,19 +107,19 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_ayuv_dither_le) for (line = 0; line < height; ++line) { // Load dithering coefficients for this line - PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,shift,dithers); + PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,8,dithers); __m128i *dst128 = (__m128i *)(dst + line * outStride); 
for (i = 0; i < width; i+=8) { // Load pixels into registers, and apply dithering PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm7, (y+i), shift); /* Y0Y0Y0Y0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm7, (u+i), shift); /* U0U0U0U0 */ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm1, xmm7, (u+i), shift); /* xUxUxUxU (low-bytes clobbered) */ PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm7, (v+i), shift); /* V0V0V0V0 */ // Interlave into AYUV xmm0 = _mm_or_si128(xmm0, xmm6); /* YAYAYAYA */ - xmm1 = _mm_slli_epi16(xmm1, 8); /* 0U0U0U0U */ + xmm1 = _mm_and_si128(xmm1, xmm6); /* clear out clobbered low-bytes */ xmm2 = _mm_or_si128(xmm2, xmm1); /* VUVUVUVU */ xmm3 = _mm_unpacklo_epi16(xmm2, xmm0); /* VUYAVUYA */ xmm4 = _mm_unpackhi_epi16(xmm2, xmm0); /* VUYAVUYA */