/*
 * Copyright (c) 2016 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef included_clib_memcpy_sse3_h
#define included_clib_memcpy_sse3_h

#include <stdint.h>
#include <x86intrin.h>
#include <vppinfra/warnings.h>

/* *INDENT-OFF* */
WARN_OFF (stringop-overflow)
/* *INDENT-ON* */
static inline void
clib_mov16 (u8 * dst, const u8 * src)
{
  __m128i xmm0;

  xmm0 = _mm_loadu_si128 ((const __m128i *) src);
  _mm_storeu_si128 ((__m128i *) dst, xmm0);
}

static inline void
clib_mov32 (u8 * dst, const u8 * src)
{
  clib_mov16 ((u8 *) dst + 0 * 16, (const u8 *) src + 0 * 16);
  clib_mov16 ((u8 *) dst + 1 * 16, (const u8 *) src + 1 * 16);
}

static inline void
clib_mov64 (u8 * dst, const u8 * src)
{
  clib_mov32 ((u8 *) dst + 0 * 32, (const u8 *) src + 0 * 32);
  clib_mov32 ((u8 *) dst + 1 * 32, (const u8 *) src + 1 * 32);
}

static inline void
clib_mov128 (u8 * dst, const u8 * src)
{
  clib_mov64 ((u8 *) dst + 0 * 64, (const u8 *) src + 0 * 64);
  clib_mov64 ((u8 *) dst + 1 * 64, (const u8 *) src + 1 * 64);
}

static inline void
clib_mov256 (u8 * dst, const u8 * src)
{
  clib_mov128 ((u8 *) dst + 0 * 128, (const u8 *) src + 0 * 128);
  clib_mov128 ((u8 *) dst + 1 * 128, (const u8 *) src + 1 * 128);
}

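/*
 * Note: each helper above copies its block as two half-size copies, so
 * e.g. clib_mov64 unrolls into four 16-byte SSE load/store pairs and
 * clib_mov256 into sixteen.
 */
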
/**
 * Macro for copying an unaligned block from one location to another with a
 * constant load offset; at most 47 leftover bytes remain for the caller to copy;
 * the locations must not overlap.
 * Requirements:
 * - Store is aligned
 * - The load offset is <offset>, which must be an immediate value within [1, 15]
 * - For <src>, <offset> bytes backwards and <16 - offset> bytes forwards must be available for loading
 * - <dst>, <src> and <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> must be pre-defined
 */
#define CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, offset) \
({ \
    int tmp; \
    while (len >= 128 + 16 - offset) { \
        xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
        len -= 128; \
        xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
        xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
        xmm3 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 3 * 16)); \
        xmm4 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 4 * 16)); \
        xmm5 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 5 * 16)); \
        xmm6 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 6 * 16)); \
        xmm7 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 7 * 16)); \
        xmm8 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 8 * 16)); \
        src = (const u8 *)src + 128; \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 2 * 16), _mm_alignr_epi8(xmm3, xmm2, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 3 * 16), _mm_alignr_epi8(xmm4, xmm3, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 4 * 16), _mm_alignr_epi8(xmm5, xmm4, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 5 * 16), _mm_alignr_epi8(xmm6, xmm5, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 6 * 16), _mm_alignr_epi8(xmm7, xmm6, offset)); \
        _mm_storeu_si128((__m128i *)((u8 *)dst + 7 * 16), _mm_alignr_epi8(xmm8, xmm7, offset)); \
        dst = (u8 *)dst + 128; \
    } \
    tmp = len; \
    len = ((len - 16 + offset) & 127) + 16 - offset; \
    tmp -= len; \
    src = (const u8 *)src + tmp; \
    dst = (u8 *)dst + tmp; \
    if (len >= 32 + 16 - offset) { \
        while (len >= 32 + 16 - offset) { \
            xmm0 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 0 * 16)); \
            len -= 32; \
            xmm1 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 1 * 16)); \
            xmm2 = _mm_loadu_si128((const __m128i *)((const u8 *)src - offset + 2 * 16)); \
            src = (const u8 *)src + 32; \
            _mm_storeu_si128((__m128i *)((u8 *)dst + 0 * 16), _mm_alignr_epi8(xmm1, xmm0, offset)); \
            _mm_storeu_si128((__m128i *)((u8 *)dst + 1 * 16), _mm_alignr_epi8(xmm2, xmm1, offset)); \
            dst = (u8 *)dst + 32; \
        } \
        tmp = len; \
        len = ((len - 16 + offset) & 31) + 16 - offset; \
        tmp -= len; \
        src = (const u8 *)src + tmp; \
        dst = (u8 *)dst + tmp; \
    } \
})

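/*
 * Reading note: _mm_alignr_epi8 (SSSE3 PALIGNR) concatenates two 16-byte
 * registers and extracts a 16-byte window shifted right by <offset> bytes.
 * In clib_memcpy_fast the destination has already been 16-byte aligned and
 * <offset> equals (src & 0xF), so every load above hits a 16-byte aligned
 * address and PALIGNR reassembles, in registers, the exact 16 source bytes
 * each store needs.  For example, with offset == 5 the first store writes
 * _mm_alignr_epi8 (xmm1, xmm0, 5), i.e. bytes 5..20 of the 32 bytes loaded
 * from (src - 5), which is precisely src[0..15].
 */
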
/**
 * Macro for copying an unaligned block from one location to another;
 * at most 47 leftover bytes remain for the caller to copy;
 * the locations must not overlap.
 * A switch statement is used because the aligning instruction requires an immediate value for the shift count.
 * Requirements:
 * - Store is aligned
 * - The load offset is <offset>, which must be within [1, 15]
 * - For <src>, <offset> bytes backwards and <16 - offset> bytes forwards must be available for loading
 * - <dst>, <src> and <len> must be variables
 * - __m128i <xmm0> ~ <xmm8> used in CLIB_MVUNALIGN_LEFT47_IMM must be pre-defined
 */
#define CLIB_MVUNALIGN_LEFT47(dst, src, len, offset) \
({ \
    switch (offset) { \
    case 0x01: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x01); break; \
    case 0x02: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x02); break; \
    case 0x03: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x03); break; \
    case 0x04: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x04); break; \
    case 0x05: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x05); break; \
    case 0x06: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x06); break; \
    case 0x07: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x07); break; \
    case 0x08: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x08); break; \
    case 0x09: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x09); break; \
    case 0x0A: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0A); break; \
    case 0x0B: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0B); break; \
    case 0x0C: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0C); break; \
    case 0x0D: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0D); break; \
    case 0x0E: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0E); break; \
    case 0x0F: CLIB_MVUNALIGN_LEFT47_IMM(dst, src, len, 0x0F); break; \
    default:; \
    } \
})

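/*
 * Each case above expands the full CLIB_MVUNALIGN_LEFT47_IMM body with a
 * distinct compile-time immediate, trading code size for the ability to
 * handle a run-time source offset.  An offset of 0 never reaches this
 * dispatcher: clib_memcpy_fast takes the aligned-copy path instead, so the
 * empty default case is unreachable in practice.
 */
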
static inline void *
clib_memcpy_fast (void *dst, const void *src, size_t n)
{
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
  uword dstu = (uword) dst;
  uword srcu = (uword) src;
  void *ret = dst;
  size_t dstofss;
  size_t srcofs;

  /**
   * Copy less than 16 bytes
   */
  if (n < 16)
    {
      if (n & 0x01)
        {
          *(u8 *) dstu = *(const u8 *) srcu;
          srcu = (uword) ((const u8 *) srcu + 1);
          dstu = (uword) ((u8 *) dstu + 1);
        }
      if (n & 0x02)
        {
          *(u16 *) dstu = *(const u16 *) srcu;
          srcu = (uword) ((const u16 *) srcu + 1);
          dstu = (uword) ((u16 *) dstu + 1);
        }
      if (n & 0x04)
        {
          *(u32 *) dstu = *(const u32 *) srcu;
          srcu = (uword) ((const u32 *) srcu + 1);
          dstu = (uword) ((u32 *) dstu + 1);
        }
      if (n & 0x08)
        {
          *(u64 *) dstu = *(const u64 *) srcu;
        }
      return ret;
    }

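  /*
   * The branch above decomposes n into its set bits, so exactly n bytes are
   * copied with at most four scalar stores: e.g. n == 13 (0b1101) copies one
   * u8, skips the u16, then one u32 and one u64.
   */
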
  /**
   * Fast way when copy size doesn't exceed 512 bytes
   */
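  /*
   * The "BACK15" paths below rely on one overlapping tail store: the final
   * clib_mov16 is anchored at (dst - 16 + n) so it always ends exactly at
   * byte n, re-writing up to 15 already-copied bytes instead of finishing
   * with a scalar loop.
   */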
  if (n <= 32)
    {
      clib_mov16 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 48)
    {
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 64)
    {
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      clib_mov16 ((u8 *) dst + 32, (const u8 *) src + 32);
      clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
      return ret;
    }
  if (n <= 128)
    {
      goto COPY_BLOCK_128_BACK15;
    }
  if (n <= 512)
    {
      if (n >= 256)
        {
          n -= 256;
          clib_mov128 ((u8 *) dst, (const u8 *) src);
          clib_mov128 ((u8 *) dst + 128, (const u8 *) src + 128);
          src = (const u8 *) src + 256;
          dst = (u8 *) dst + 256;
        }
    COPY_BLOCK_255_BACK15:
      if (n >= 128)
        {
          n -= 128;
          clib_mov128 ((u8 *) dst, (const u8 *) src);
          src = (const u8 *) src + 128;
          dst = (u8 *) dst + 128;
        }
    COPY_BLOCK_128_BACK15:
      if (n >= 64)
        {
          n -= 64;
          clib_mov64 ((u8 *) dst, (const u8 *) src);
          src = (const u8 *) src + 64;
          dst = (u8 *) dst + 64;
        }
    COPY_BLOCK_64_BACK15:
      if (n >= 32)
        {
          n -= 32;
          clib_mov32 ((u8 *) dst, (const u8 *) src);
          src = (const u8 *) src + 32;
          dst = (u8 *) dst + 32;
        }
      if (n > 16)
        {
          clib_mov16 ((u8 *) dst, (const u8 *) src);
          clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
          return ret;
        }
      if (n > 0)
        {
          clib_mov16 ((u8 *) dst - 16 + n, (const u8 *) src - 16 + n);
        }
      return ret;
    }

  /**
   * Make the store aligned when the copy size exceeds 512 bytes,
   * and make sure the first 15 bytes are copied, because the
   * unaligned copy macros require up to 15 bytes of backwards
   * access.
   */
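  /*
   * Worked example (illustrative): if dst is misaligned by 5 bytes,
   * dstofss = 16 - 5 + 16 = 27; the 32-byte clib_mov32 below covers those
   * 27 bytes (and a few more), after which dst is 16-byte aligned and src
   * has advanced far enough that the (src - offset) loads in the unaligned
   * path stay inside the source buffer.
   */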
  dstofss = (uword) dst & 0x0F;
  if (dstofss > 0)
    {
      dstofss = 16 - dstofss + 16;
      n -= dstofss;
      clib_mov32 ((u8 *) dst, (const u8 *) src);
      src = (const u8 *) src + dstofss;
      dst = (u8 *) dst + dstofss;
    }
  srcofs = ((uword) src & 0x0F);

  /**
   * For aligned copy
   */
  if (srcofs == 0)
    {
      /**
       * Copy 256-byte blocks
       */
      for (; n >= 256; n -= 256)
        {
          clib_mov256 ((u8 *) dst, (const u8 *) src);
          dst = (u8 *) dst + 256;
          src = (const u8 *) src + 256;
        }

      /**
       * Copy whatever is left
       */
      goto COPY_BLOCK_255_BACK15;
    }

  /**
   * For copy with unaligned loads
   */
  CLIB_MVUNALIGN_LEFT47 (dst, src, n, srcofs);

  /**
   * Copy whatever is left
   */
  goto COPY_BLOCK_64_BACK15;
}

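/*
 * Usage sketch (illustrative only; the buffer names below are hypothetical):
 *
 *   u8 dst_buf[1500], src_buf[1500];
 *   ...
 *   clib_memcpy_fast (dst_buf, src_buf, sizeof (src_buf));
 *
 * Semantics follow memcpy: the regions must not overlap and the destination
 * pointer is returned.
 */
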
/* *INDENT-OFF* */
WARN_ON (stringop-overflow)
/* *INDENT-ON* */

#undef CLIB_MVUNALIGN_LEFT47_IMM
#undef CLIB_MVUNALIGN_LEFT47

#endif /* included_clib_memcpy_sse3_h */

/*
 * fd.io coding-style-patch-verification: ON
 *
 * Local Variables:
 * eval: (c-set-style "gnu")
 * End:
 */