相信大家代码里有很多地方用到memcpy这个函数,相信这个函数的占用是不小的,有时优化了memcpy,能使整个项目的运行效率提升。通过适当的编码技巧,让我们的内存拷贝速度超过memcpy两倍,是可以实现的。
有人说memcpy还能优化么?不就是rep movsd么?CPU和内存之间的带宽是固定的,怎么可能优化呢?其实是普通的内存拷贝并没有发挥全部的带宽,很多被浪费掉了,比如要等到数据完全读取成功后再去写入,然后要写入成功后再去读取新的。而优化本身就是使这两者尽量的并行。发挥最大的带宽。
现代的内存拷贝都需要判断内存大小,并按照大小选择不同策略进行拷贝,比如大内存拷贝(超过cache大小),那么最好使用并行若干读取指令和写入指令,然后再并行写入,使得CPU前后结果依赖得以大大降低,并且使用缓冲预取,再CPU处理数据之前,就把数据放到离CPU最近的CACHE。这样已经可以比memcpy快很多了,如果再加上一些新指令的帮助,大内存拷贝会大大提速。
但是用同样的代码去拷贝小内存,因为额外的开销,难对齐的内存,准备工作一大堆,如果实际要拷贝的内存很小的话,这样的工作开销还比直接按照 dword复制慢很多。在VC10的memcpy实现中将内存按照128个字节大小进行区分,小于该大小的使用普通拷贝,大于该大小的使用普通SSE指令拷贝,而现在我们就要来挑战VC10的memcpy,本篇先叙述小内存拷贝部分。
适合拷贝64字节以内的数据量。原理很简单,LOOP UNROLL。rep movsb/movsd是靠不住的,小内存拷贝还是得展开循环。废话不多说,代码贴上:
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <windows.h> #include <mmsystem.h> #ifdef _MSC_VER #pragma comment(lib, "winmm.lib") #endif void *memcpy_tiny(void *dst, const void *src, size_t len) { if (len <= 63) { register unsigned char *dd = (unsigned char*)dst + len; register const unsigned char *ss = (const unsigned char*)src + len; switch (len) { case 68: *((int*)(dd - 68)) = *((int*)(ss - 68)); case 64: *((int*)(dd - 64)) = *((int*)(ss - 64)); case 60: *((int*)(dd - 60)) = *((int*)(ss - 60)); case 56: *((int*)(dd - 56)) = *((int*)(ss - 56)); case 52: *((int*)(dd - 52)) = *((int*)(ss - 52)); case 48: *((int*)(dd - 48)) = *((int*)(ss - 48)); case 44: *((int*)(dd - 44)) = *((int*)(ss - 44)); case 40: *((int*)(dd - 40)) = *((int*)(ss - 40)); case 36: *((int*)(dd - 36)) = *((int*)(ss - 36)); case 32: *((int*)(dd - 32)) = *((int*)(ss - 32)); case 28: *((int*)(dd - 28)) = *((int*)(ss - 28)); case 24: *((int*)(dd - 24)) = *((int*)(ss - 24)); case 20: *((int*)(dd - 20)) = *((int*)(ss - 20)); case 16: *((int*)(dd - 16)) = *((int*)(ss - 16)); case 12: *((int*)(dd - 12)) = *((int*)(ss - 12)); case 8: *((int*)(dd - 8)) = *((int*)(ss - 8)); case 4: *((int*)(dd - 4)) = *((int*)(ss - 4)); break; case 67: *((int*)(dd - 67)) = *((int*)(ss - 67)); case 63: *((int*)(dd - 63)) = *((int*)(ss - 63)); case 59: *((int*)(dd - 59)) = *((int*)(ss - 59)); case 55: *((int*)(dd - 55)) = *((int*)(ss - 55)); case 51: *((int*)(dd - 51)) = *((int*)(ss - 51)); case 47: *((int*)(dd - 47)) = *((int*)(ss - 47)); case 43: *((int*)(dd - 43)) = *((int*)(ss - 43)); case 39: *((int*)(dd - 39)) = *((int*)(ss - 39)); case 35: *((int*)(dd - 35)) = *((int*)(ss - 35)); case 31: *((int*)(dd - 31)) = *((int*)(ss - 31)); case 27: *((int*)(dd - 27)) = *((int*)(ss - 27)); case 23: *((int*)(dd - 23)) = *((int*)(ss - 23)); case 19: *((int*)(dd - 19)) = *((int*)(ss - 19)); case 15: *((int*)(dd - 15)) = *((int*)(ss - 15)); case 11: *((int*)(dd - 11)) = *((int*)(ss - 11)); case 7: *((int*)(dd - 7)) = *((int*)(ss - 7)); *((int*)(dd - 4)) = *((int*)(ss - 4)); break; case 3: *((short*)(dd - 3)) = *((short*)(ss - 3)); dd[-1] = ss[-1]; break; case 66: *((int*)(dd - 66)) = *((int*)(ss - 66)); case 62: *((int*)(dd - 62)) = *((int*)(ss - 62)); case 58: *((int*)(dd - 58)) = *((int*)(ss - 58)); case 54: *((int*)(dd - 54)) = *((int*)(ss - 54)); case 50: *((int*)(dd - 50)) = *((int*)(ss - 50)); case 46: *((int*)(dd - 46)) = *((int*)(ss - 46)); case 42: *((int*)(dd - 42)) = *((int*)(ss - 42)); case 38: *((int*)(dd - 38)) = *((int*)(ss - 38)); case 34: *((int*)(dd - 34)) = *((int*)(ss - 34)); case 30: *((int*)(dd - 30)) = *((int*)(ss - 30)); case 26: *((int*)(dd - 26)) = *((int*)(ss - 26)); case 22: *((int*)(dd - 22)) = *((int*)(ss - 22)); case 18: *((int*)(dd - 18)) = *((int*)(ss - 18)); case 14: *((int*)(dd - 14)) = *((int*)(ss - 14)); case 10: *((int*)(dd - 10)) = *((int*)(ss - 10)); case 6: *((int*)(dd - 6)) = *((int*)(ss - 6)); case 2: *((short*)(dd - 2)) = *((short*)(ss - 2)); break; case 65: *((int*)(dd - 65)) = *((int*)(ss - 65)); case 61: *((int*)(dd - 61)) = *((int*)(ss - 61)); case 57: *((int*)(dd - 57)) = *((int*)(ss - 57)); case 53: *((int*)(dd - 53)) = *((int*)(ss - 53)); case 49: *((int*)(dd - 49)) = *((int*)(ss - 49)); case 45: *((int*)(dd - 45)) = *((int*)(ss - 45)); case 41: *((int*)(dd - 41)) = *((int*)(ss - 41)); case 37: *((int*)(dd - 37)) = *((int*)(ss - 37)); case 33: *((int*)(dd - 33)) = *((int*)(ss - 33)); case 29: *((int*)(dd - 29)) = *((int*)(ss - 29)); case 25: *((int*)(dd - 25)) = *((int*)(ss - 25)); case 21: *((int*)(dd - 21)) = *((int*)(ss - 21)); case 17: *((int*)(dd - 17)) = *((int*)(ss - 17)); case 13: *((int*)(dd - 13)) = *((int*)(ss - 13)); case 9: *((int*)(dd - 9)) = *((int*)(ss - 9)); case 5: *((int*)(dd - 5)) = *((int*)(ss - 5)); case 1: dd[-1] = ss[-1]; break; case 0: default: break; } return dd; } else { return memcpy(dst, src, len); } } typedef void *(*MemCpyProc)(void *dst, const void *src, size_t size); void benchmark(MemCpyProc memcpy1, MemCpyProc memcpy2, int x1, int x2) { static unsigned char card1[0x20015]; static unsigned char card2[0x20015]; char *align1 = (char*)card1 + ((16 - ((size_t)card1 & 15)) & 15); char *align2 = (char*)card2 + ((16 - ((size_t)card2 & 15)) & 15); long totalsize = 0x20000000; int size, t1, t2, i; for (size = 64; size >= 8; size = size * 2 / 3) { int times = totalsize / size; printf("memcpy: size=%6d\t times=%8d:\t ", size, times); Sleep(50); t1 = timeGetTime(); for (i = 0; i < times; i++) memcpy1(align1 + x1, align2 + x2, size); t1 = timeGetTime() - t1; Sleep(50); t2 = timeGetTime(); for (i = 0; i < times; i++) memcpy2(align1 + x1, align2 + x2, size); t2 = timeGetTime() - t2; printf("t1=%4d\t t2=%4d\n", t1, t2); } } //! flag: -O3 //! exe: int main(void) { printf("\ndst: aligned src: aligned\n"); benchmark(memcpy, memcpy_tiny, 0, 0); printf("\ndst: un-aligned src: aligned\n"); benchmark(memcpy, memcpy_tiny, 1, 0); printf("\ndst: aligned src: un-aligned\n"); benchmark(memcpy, memcpy_tiny, 0, 1); printf("\ndst: un-aligned src: un-aligned\n"); benchmark(memcpy, memcpy_tiny, 3, 1); return 0; }
该代码展开了循环,效率比rep movsd之类的高2-3倍,比memcpy平均快2倍,下面是测试数据,t1代表memcpy, t2代表memcpy_tiny,使用VC2010最大优化模式编译:
dst: aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 239 t2= 243
memcpy: size= 42 times=12782640: t1= 341 t2= 170
memcpy: size= 28 times=19173961: t1= 225 t2= 205
memcpy: size= 18 times=29826161: t1= 363 t2= 232
memcpy: size= 12 times=44739242: t1= 448 t2= 294
memcpy: size= 8 times=67108864: t1= 699 t2= 406
dst: un-aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 255 t2= 256
memcpy: size= 42 times=12782640: t1= 369 t2= 269
memcpy: size= 28 times=19173961: t1= 289 t2= 165
memcpy: size= 18 times=29826161: t1= 421 t2= 203
memcpy: size= 12 times=44739242: t1= 493 t2= 279
memcpy: size= 8 times=67108864: t1= 956 t2= 354
dst: aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 241 t2= 257
memcpy: size= 42 times=12782640: t1= 303 t2= 218
memcpy: size= 28 times=19173961: t1= 258 t2= 181
memcpy: size= 18 times=29826161: t1= 319 t2= 261
memcpy: size= 12 times=44739242: t1= 448 t2= 325
memcpy: size= 8 times=67108864: t1= 646 t2= 401
dst: un-aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 243 t2= 257
memcpy: size= 42 times=12782640: t1= 329 t2= 273
memcpy: size= 28 times=19173961: t1= 276 t2= 193
memcpy: size= 18 times=29826161: t1= 336 t2= 249
memcpy: size= 12 times=44739242: t1= 529 t2= 332
memcpy: size= 8 times=67108864: t1= 915 t2= 368
可见平均在64,42,28,18,12,8等几个大小的测试中,平均速度是memcpy的两倍。主要是依靠跳转表来实现的(VC INLINE ASM不能描述跳转表,因此该部分无法写成inline asm模式,索幸switch /case在大部分C编译器中都是用跳转表来实现的,因此我们可以用switch/case来模拟基于汇编指令跳转表的内存拷贝)。该部分反编译的结果:
; 18 : switch (len) 00016 8d 71 ff lea esi, DWORD PTR [ecx-1] 00019 03 c1 add eax, ecx 0001b 83 fe 43 cmp esi, 67 ; 00000043H 0001e 0f 87 fd 01 00 00 ja $LN2@memcpy_tin 00024 ff 24 b5 00 00 00 00 jmp DWORD PTR $LN77@memcpy_tin[esi*4] $LN70@memcpy_tin: ; 19 : { ; 20 : case 68: *((int*)(dd - 68)) = *((int*)(ss - 68)); 0002b 8b 74 0a bc mov esi, DWORD PTR [edx+ecx-68] 0002f 89 70 bc mov DWORD PTR [eax-68], esi $LN69@memcpy_tin: ; 21 : case 64: *((int*)(dd - 64)) = *((int*)(ss - 64)); 00032 8b 74 0a c0 mov esi, DWORD PTR [edx+ecx-64] 00036 89 70 c0 mov DWORD PTR [eax-64], esi $LN68@memcpy_tin: ; 22 : case 60: *((int*)(dd - 60)) = *((int*)(ss - 60)); 00039 8b 74 0a c4 mov esi, DWORD PTR [edx+ecx-60] 0003d 89 70 c4 mov DWORD PTR [eax-60], esi $LN67@memcpy_tin: ; 23 : case 56: *((int*)(dd - 56)) = *((int*)(ss - 56)); 00040 8b 74 0a c8 mov esi, DWORD PTR [edx+ecx-56] 00044 89 70 c8 mov DWORD PTR [eax-56], esi $LN66@memcpy_tin: ................................. $LN77@memcpy_tin: ; 105 : } ; 106 : } 0022c 00 00 00 00 DD $LN3@memcpy_tin 00230 00 00 00 00 DD $LN20@memcpy_tin 00234 00 00 00 00 DD $LN37@memcpy_tin 00238 00 00 00 00 DD $LN54@memcpy_tin 0023c 00 00 00 00 DD $LN4@memcpy_tin 00240 00 00 00 00 DD $LN21@memcpy_tin 00244 00 00 00 00 DD $LN38@memcpy_tin 00248 00 00 00 00 DD $LN55@memcpy_tin 0024c 00 00 00 00 DD $LN5@memcpy_tin 00250 00 00 00 00 DD $LN22@memcpy_tin 00254 00 00 00 00 DD $LN39@memcpy_tin 00258 00 00 00 00 DD $LN56@memcpy_tin 0025c 00 00 00 00 DD $LN6@memcpy_tin 00260 00 00 00 00 DD $LN23@memcpy_tin 00264 00 00 00 00 DD $LN40@memcpy_tin 00268 00 00 00 00 DD $LN57@memcpy_tin 0026c 00 00 00 00 DD $LN7@memcpy_tin 00270 00 00 00 00 DD $LN24@memcpy_tin 00274 00 00 00 00 DD $LN41@memcpy_tin 00278 00 00 00 00 DD $LN58@memcpy_tin 0027c 00 00 00 00 DD $LN8@memcpy_tin 00280 00 00 00 00 DD $LN25@memcpy_tin 00284 00 00 00 00 DD $LN42@memcpy_tin 00288 00 00 00 00 DD $LN59@memcpy_tin
大家发现诀窍没有?好了,其实在大部分情况下,比如脚本处理等,很多是64个字节以内的内存拷贝,所以小内存拷贝的优化是比较实在的,请大家继续关注“内存拷贝优化(2)-大内存拷贝优化”。
读了《内存拷贝优化(1)-小内存拷贝优化》,很好的文章,不过没有看到 《内存拷贝优化(2)-大内存拷贝优化》。有的话,能否给我发一份呢?十分感谢!
我的邮箱是:ymxie@junege.com。
期待您的回信。
最近太忙,隔久写,呵呵。只有代码。。。。。还需要写文章。。。。
期待中。。。
代码可以先看看吗,谢谢!
这个技巧不错
不过我刚才在Linux下实验了一下,8字节的版本,没有libc的memcpy性能高…..
@egmkang
-O3了没?为何改为8字节呢?
@skywind
-O2, 8字节和4字节没啥本质区别,都比libc的速度慢50%,就放弃了
@egmkang
x64 ?
@nike
大内存拷贝文章已经更新,看首页。
@nike
@egmkang
大内存拷贝文章已经更新,看首页。