1 /*
2  * Copyright (c) 2020-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
17 #define GRAPHIC_LITE_GRAPHIC_NEON_UTILS_H
18 
19 #include "graphic_config.h"
20 #ifdef ARM_NEON_OPT
21 #include <arm_neon.h>
22 
23 #include "gfx_utils/color.h"
24 #include "gfx_utils/graphic_math.h"
25 #include "gfx_utils/graphic_types.h"
26 
27 namespace OHOS {
28 #define BASEMSB 128
29 #define NEON_STEP_4 4
30 #define NEON_STEP_8 8
31 #define NEON_STEP_32 32
32 #define NEON_A 3
33 #define NEON_R 2
34 #define NEON_G 1
35 #define NEON_B 0
36 
// Per-lane approximation of (a * b) / 255:
// t = a * b + 128; result = (t + (t >> 8)) >> 8.
static inline uint8x8_t Multipling(uint8x8_t a, uint8x8_t b)
{
    uint16x8_t t = vmlal_u8(vdupq_n_u16(BASEMSB), a, b);       // a * b + 128, widened to 16 bits
    uint8x8_t highByte = vshrn_n_u16(t, NEON_STEP_8);          // t >> 8
    uint16x8_t rounded = vaddq_u16(vmovl_u8(highByte), t);     // t + (t >> 8)
    return vshrn_n_u16(rounded, NEON_STEP_8);
}
43 
// Premultiplied lerp helper: combines p and q, then subtracts Multipling(p, a).
// NOTE(review): the classic prelerp formula is p + q - (p * a) / 255, but here
// the widened sum (p + q) is narrowed with a >> 8 shift (vshrn) rather than
// low-byte truncation (vmovn) — confirm this is the intended formula.
static inline uint8x8_t NeonPreLerp(uint8x8_t p, uint8x8_t q, uint8x8_t a)
{
    // vaddl_u8 widens to 16 bits so the sum cannot wrap.
    uint16x8_t calcType = vaddl_u8(p, q);
    return vsub_u8(vshrn_n_u16(calcType, NEON_STEP_8), Multipling(p, a));
}
49 
// Interpolates between p and q by alpha using the round-twice /255 trick:
// t = alpha * (p - q) + 128; result = p saturating+ ((t + (t >> 8)) >> 8).
// NOTE(review): (p - q) is a wrap-around u8 subtraction and the textbook lerp
// uses (q - p) — confirm the operand order is intentional.
static inline uint8x8_t NeonLerp(uint8x8_t p, uint8x8_t q, uint8x8_t alpha)
{
    uint16x8_t mulRes = vmlal_u8(vdupq_n_u16(BASEMSB), alpha, vsub_u8(p, q));
    uint8x8_t result = vshrn_n_u16(mulRes, NEON_STEP_8);

    // vqadd_u8 saturates, so the final add cannot wrap past 255.
    return vqadd_u8(p, vshrn_n_u16(vaddq_u16(vmovl_u8(result), mulRes), NEON_STEP_8));
}
// return vIn / 255
// Fast per-lane division by 255: (v + ((v + 257) >> 8)) >> 8, narrowed to 8 bits.
// Uses explicit vaddq_u16 intrinsics instead of the GNU vector-extension `+`
// operator so the header also builds with compilers lacking that extension.
static inline uint8x8_t NeonFastDiv255(uint16x8_t vIn)
{
    // 257: 2^8 + 1; 8: number of shifts
    uint16x8_t correction = vshrq_n_u16(vaddq_u16(vIn, vdupq_n_u16(257)), 8);
    return vmovn_u16(vshrq_n_u16(vaddq_u16(vIn, correction), 8));
}
63 
// return a * b / 255
static inline uint8x8_t NeonMulDiv255(uint8x8_t a, uint8x8_t b)
{
    // Widen the product to 16 bits, then divide by 255 with the fast path.
    uint16x8_t product = vmull_u8(a, b);
    return NeonFastDiv255(product);
}
69 
// return a / b
// a, b and result are floating-point numbers.
// Starts from the hardware reciprocal estimate of b and refines it with two
// Newton-Raphson steps (vrecpsq_f32) before multiplying by a.
static inline float32x4_t NeonDiv(float32x4_t a, float32x4_t b)
{
    float32x4_t recip = vrecpeq_f32(b);
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);  // refinement step 1
    recip = vmulq_f32(vrecpsq_f32(b, recip), recip);  // refinement step 2
    return vmulq_f32(a, recip);
}
79 
// return a / b
// a is a 16-bits integer, b and result are 8-bits integers.
// Each half is widened to u32, converted to float, divided via NeonDiv, then
// converted back and narrowed to 8 bits.
static inline uint8x8_t NeonDivInt(uint16x8_t a, uint8x8_t b)
{
    uint16x8_t divisor = vmovl_u8(b);
    float32x4_t lowA = vcvtq_f32_u32(vmovl_u16(vget_low_u16(a)));
    float32x4_t lowB = vcvtq_f32_u32(vmovl_u16(vget_low_u16(divisor)));
    float32x4_t highA = vcvtq_f32_u32(vmovl_u16(vget_high_u16(a)));
    float32x4_t highB = vcvtq_f32_u32(vmovl_u16(vget_high_u16(divisor)));
    uint16x4_t lowQuot = vmovn_u32(vcvtq_u32_f32(NeonDiv(lowA, lowB)));
    uint16x4_t highQuot = vmovn_u32(vcvtq_u32_f32(NeonDiv(highA, highB)));
    return vmovn_u16(vcombine_u16(lowQuot, highQuot));
}
90 
// Copies MIN(dstSize, srcSize) bytes from src to dst using NEON 64-byte bursts.
// The leading (size % 64) remainder is copied with memcpy_s; the remaining
// multiple of 64 is moved by an ARM32 inline-assembly loop (VLDM/VSTM).
// NOTE(review): on memcpy_s failure the function returns silently, so callers
// cannot detect a partial copy — confirm this best-effort contract is intended.
static void NeonMemcpy(void* dst, int32_t dstSize, const void* src, int32_t srcSize)
{
    int32_t sz = MATH_MIN(dstSize, srcSize);
    // 64-bytes aligned
    int32_t mod = sz % 64;
    if (mod) {
        if (memcpy_s(dst, mod, src, mod) != EOK) {
            return;
        }
        sz -= mod;
        if (sz == 0) {
            return;
        }
        // Advance both pointers past the remainder; sz is now a multiple of 64.
        dst = (uint8_t*)dst + mod;
        src = (uint8_t*)src + mod;
    }

    asm volatile(
        "NEONCopyPLD: \n"
        " PLD [%[src], #0xC0] \n"  // prefetch 192 bytes ahead of src
        " VLDM %[src]!, {d0-d7} \n"  // load 64 bytes, post-increment src
        " VSTM %[dst]!, {d0-d7} \n"  // store 64 bytes, post-increment dst
        " SUBS %[sz], %[sz], #0x40 \n"  // sz -= 64, updating flags
        " BGT NEONCopyPLD \n"  // loop while sz > 0
        : [dst] "+r"(dst), [src] "+r"(src), [sz] "+r"(sz)
        :
        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
}
119 
// Alpha-blends 8 source pixels (r2/g2/b2/a2) onto 8 destination pixels
// (r1/g1/b1/a1); the blended result is written back through the references.
// Uses explicit vadd/vsub intrinsics instead of GNU vector-extension operators
// so the header also builds with compilers lacking that extension.
static inline void NeonBlendRGBA(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
                                 uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    // da = a1 * (255 - a2) / 255: destination alpha attenuated by source coverage.
    uint8x8_t da = NeonMulDiv255(a1, vsub_u8(vdup_n_u8(OPA_OPAQUE), a2));
    // Blended alpha: a1 - a1 * a2 / 255 + a2.
    a1 = vadd_u8(vsub_u8(a1, NeonMulDiv255(a2, a1)), a2);
    uint16x8_t r = vaddq_u16(vmull_u8(r2, a2), vmull_u8(r1, da));
    uint16x8_t g = vaddq_u16(vmull_u8(g2, a2), vmull_u8(g1, da));
    uint16x8_t b = vaddq_u16(vmull_u8(b2, a2), vmull_u8(b1, da));
    // Divide each 16-bit channel sum by the blended alpha.
    r1 = NeonDivInt(r, a1);
    g1 = NeonDivInt(g, a1);
    b1 = NeonDivInt(b, a1);
}
132 
// Alpha-blends 8 source pixels onto 8 destination pixels whose alpha channel
// is ignored as coverage (X = don't care); result written back via references.
// Uses explicit vadd/vsub intrinsics instead of GNU vector-extension operators
// so the header also builds with compilers lacking that extension.
static inline void NeonBlendXRGB(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
                                 uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    // da = 255 - a2: the destination weight.
    uint8x8_t da = vsub_u8(vdup_n_u8(OPA_OPAQUE), a2);
    // Blended alpha: a1 - a1 * a2 / 255 + a2.
    a1 = vadd_u8(vsub_u8(a1, NeonMulDiv255(a2, a1)), a2);
    uint16x8_t r = vaddq_u16(vmull_u8(r2, a2), vmull_u8(r1, da));
    uint16x8_t g = vaddq_u16(vmull_u8(g2, a2), vmull_u8(g1, da));
    uint16x8_t b = vaddq_u16(vmull_u8(b2, a2), vmull_u8(b1, da));
    // Divide each 16-bit channel sum by the blended alpha.
    r1 = NeonDivInt(r, a1);
    g1 = NeonDivInt(g, a1);
    b1 = NeonDivInt(b, a1);
}
145 
// Blends 8 source pixels onto 8 opaque destination pixels:
// dst = src * a2 / 255 + dst * (255 - a2) / 255. a1 is accepted only to keep
// the signature uniform with the other NeonBlend* helpers and is not used.
// Uses explicit vadd/vsub intrinsics instead of GNU vector-extension operators
// so the header also builds with compilers lacking that extension.
static inline void NeonBlendRGB(uint8x8_t& r1, uint8x8_t& g1, uint8x8_t& b1, uint8x8_t& a1,
                                uint8x8_t r2, uint8x8_t g2, uint8x8_t b2, uint8x8_t a2)
{
    uint8x8_t da = vsub_u8(vdup_n_u8(OPA_OPAQUE), a2);
    r1 = vadd_u8(NeonMulDiv255(r2, a2), NeonMulDiv255(r1, da));
    g1 = vadd_u8(NeonMulDiv255(g2, a2), NeonMulDiv255(g1, da));
    b1 = vadd_u8(NeonMulDiv255(b2, a2), NeonMulDiv255(b1, da));
}
154 
// De-interleaves 8 ARGB8888 pixels from buf into per-channel vectors.
static inline void LoadBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels = vld4_u8(buf);
    a = pixels.val[NEON_A];
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
163 
// De-interleaves 8 XRGB8888 pixels from buf into per-channel vectors.
// The X byte is loaded into a, mirroring LoadBuf_ARGB8888.
static inline void LoadBuf_XRGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels = vld4_u8(buf);
    a = pixels.val[NEON_A];
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
172 
// De-interleaves 8 RGB888 pixels from buf; the alpha reference is left untouched.
static inline void LoadBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x3_t pixels = vld3_u8(buf);
    b = pixels.val[NEON_B];
    g = pixels.val[NEON_G];
    r = pixels.val[NEON_R];
}
180 
// Unpacks 8 RGB565 pixels into left-aligned 8-bit channels;
// the alpha reference is left untouched.
static inline void LoadBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    // 11, 3: top 5 bits, left-aligned => RRRRR000
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: middle 6 bits, left-aligned => GGGGGG00
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: low 5 bits, left-aligned => BBBBB000
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
191 
// De-interleaves 8 ARGB8888 pixels and scales the loaded alpha by opa
// (a = pixel_alpha * opa / 255).
static inline void LoadBufA_ARGB8888(uint8_t* buf,
                                     uint8x8_t& r,
                                     uint8x8_t& g,
                                     uint8x8_t& b,
                                     uint8x8_t& a,
                                     uint8_t opa)
{
    uint8x8x4_t pixels = vld4_u8(buf);
    a = NeonMulDiv255(pixels.val[NEON_A], vdup_n_u8(opa));
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
205 
// De-interleaves 8 XRGB8888 pixels; the X byte is discarded and a is filled
// with the caller-supplied opacity.
static inline void LoadBufA_XRGB8888(uint8_t* buf,
                                     uint8x8_t& r,
                                     uint8x8_t& g,
                                     uint8x8_t& b,
                                     uint8x8_t& a,
                                     uint8_t opa)
{
    uint8x8x4_t pixels = vld4_u8(buf);
    a = vdup_n_u8(opa);
    r = pixels.val[NEON_R];
    g = pixels.val[NEON_G];
    b = pixels.val[NEON_B];
}
219 
// De-interleaves 8 RGB888 pixels and fills a with the caller-supplied opacity.
static inline void LoadBufA_RGB888(uint8_t* buf,
                                   uint8x8_t& r,
                                   uint8x8_t& g,
                                   uint8x8_t& b,
                                   uint8x8_t& a,
                                   uint8_t opa)
{
    uint8x8x3_t pixels = vld3_u8(buf);
    a = vdup_n_u8(opa);
    b = pixels.val[NEON_B];
    g = pixels.val[NEON_G];
    r = pixels.val[NEON_R];
}
233 
// Unpacks 8 RGB565 pixels into left-aligned 8-bit channels and fills a with
// the caller-supplied opacity.
static inline void LoadBufA_RGB565(uint8_t* buf,
                                   uint8x8_t& r,
                                   uint8x8_t& g,
                                   uint8x8_t& b,
                                   uint8x8_t& a,
                                   uint8_t opa)
{
    uint16x8_t pixels = vld1q_u16(reinterpret_cast<uint16_t*>(buf));
    a = vdup_n_u8(opa);
    // 11, 3: top 5 bits, left-aligned => RRRRR000
    r = vmovn_u16(vshlq_n_u16(vshrq_n_u16(pixels, 11), 3));
    // 5, 2: middle 6 bits, left-aligned => GGGGGG00
    g = vshl_n_u8(vshrn_n_u16(pixels, 5), 2);
    // 3: low 5 bits, left-aligned => BBBBB000
    b = vmovn_u16(vshlq_n_u16(pixels, 3));
}
// Fills 8 consecutive ARGB8888 pixels at buf with a single color.
static inline void SetPixelColor_ARGB8888(uint8_t* buf,
                                          const uint8_t& r,
                                          const uint8_t& g,
                                          const uint8_t& b,
                                          const uint8_t& a)
{
    uint8x8x4_t pixels;
    pixels.val[NEON_A] = vdup_n_u8(a);
    pixels.val[NEON_R] = vdup_n_u8(r);
    pixels.val[NEON_G] = vdup_n_u8(g);
    pixels.val[NEON_B] = vdup_n_u8(b);
    vst4_u8(buf, pixels);
}
// Copies 8 ARGB8888 pixels (32 bytes) from srcBuf to dstBuf.
// The original de-interleaved with vld4_u8 and re-interleaved with vst4_u8,
// which round-trips the exact same 32 bytes; a straight load/store pair is
// equivalent and avoids the needless (de)interleave work.
static inline void SetPixelColor_ARGB8888(uint8_t* dstBuf, uint8_t* srcBuf)
{
    vst1q_u8(dstBuf, vld1q_u8(srcBuf));                                        // pixels 0-3
    vst1q_u8(dstBuf + 2 * NEON_STEP_8, vld1q_u8(srcBuf + 2 * NEON_STEP_8));    // pixels 4-7
}
// Re-interleaves per-channel vectors into 8 ARGB8888 pixels at buf.
static inline void StoreBuf_ARGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels;
    pixels.val[NEON_A] = a;
    pixels.val[NEON_R] = r;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_B] = b;
    vst4_u8(buf, pixels);
}
282 
// Re-interleaves per-channel vectors into 8 XRGB8888 pixels at buf;
// the a vector is written into the X byte, mirroring StoreBuf_ARGB8888.
static inline void StoreBuf_XRGB8888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x4_t pixels;
    pixels.val[NEON_A] = a;
    pixels.val[NEON_R] = r;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_B] = b;
    vst4_u8(buf, pixels);
}
292 
// Re-interleaves r/g/b vectors into 8 RGB888 pixels at buf; a is accepted only
// to keep the signature uniform with the other StoreBuf_* helpers.
static inline void StoreBuf_RGB888(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    uint8x8x3_t pixels;
    pixels.val[NEON_B] = b;
    pixels.val[NEON_G] = g;
    pixels.val[NEON_R] = r;
    vst3_u8(buf, pixels);
}
301 
// Packs r/g/b vectors into 8 RGB565 pixels at buf; a is accepted only to keep
// the signature uniform with the other StoreBuf_* helpers.
static inline void StoreBuf_RGB565(uint8_t* buf, uint8x8_t& r, uint8x8_t& g, uint8x8_t& b, uint8x8_t& a)
{
    // Widen r into the top byte: RRRRRXXX|XXXXXXXX.
    uint16x8_t packed = vshll_n_u8(r, 8);
    // Shift g in after the top 5 bits: RRRRRGGG|GGGXXXXX.
    packed = vsriq_n_u16(packed, vshll_n_u8(g, 8), 5);
    // Shift b in after the top 11 bits: RRRRRGGG|GGGBBBBB.
    packed = vsriq_n_u16(packed, vshll_n_u8(b, 8), 11);
    vst1q_u16(reinterpret_cast<uint16_t*>(buf), packed);
}
318 } // namespace OHOS
319 #endif
320 #endif
321