diff --git a/src/Kernel/KUtils.ZC b/src/Kernel/KUtils.ZC index be4d6bfc..777d8874 100755 --- a/src/Kernel/KUtils.ZC +++ b/src/Kernel/KUtils.ZC @@ -68,7 +68,20 @@ _MEMCOPY:: MOV RDI, U64 SF_ARG1[RBP] // dst MOV RSI, U64 SF_ARG2[RBP] // src MOV RCX, U64 SF_ARG3[RBP] // count -@@05: // SSE 128-bit memcopy (count >= 16) + TEST RDI, 0xF // check if dst is 16byte aligned + JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks + TEST RSI, 0xF // check if src is 16byte aligned + JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks +@@03: // SSE 128-bit ALIGNED memcopy (count >= 16) + CMP RCX, 16 // count <, >, == 16 bytes? + JL @@10 // if count less than 16, jump down + MOVAPS XMM15, [RSI]// move 128 bits, src-->xmm ALIGNED + MOVAPS [RDI], XMM15// move 128 bits, xmm-->dst mem ALIGNED + ADD RSI, 16 // increment src addr by 128 bits + ADD RDI, 16 // increment dst addr by 128 bits + ADD RCX, -16 // decrement count by 128 bits + JMP @@03 // jump back to 16 byte check +@@05: // SSE 128-bit UNALIGNED memcopy (count >= 16) CMP RCX, 16 // count <, >, == 16 bytes? JL @@10 // if count less than 16, jump down MOVUPS XMM15, [RSI]// move 128 bits, src-->xmm diff --git a/src/System/Gr/GrInitB.ZC b/src/System/Gr/GrInitB.ZC index 484101e8..f2887283 100755 --- a/src/System/Gr/GrInitB.ZC +++ b/src/System/Gr/GrInitB.ZC @@ -187,7 +187,7 @@ U0 GrInit2() gr.to_8_bits = MAlloc(256 * sizeof(I64)); gr.to_8_colors = MAlloc(256 * sizeof(I64)); - gr.screen_cache = MAlloc(sys_framebuffer_width * sys_framebuffer_height); + gr.screen_cache = MAllocAligned(sys_framebuffer_width * sys_framebuffer_height, 16); gr.text_base = CAlloc(TEXT_ROWS * TEXT_COLS * sizeof(U32)); gr.win_uncovered_bitmap = CAlloc(65536 / 8); diff --git a/src/System/Gr/GrScreen.ZC b/src/System/Gr/GrScreen.ZC index 8edf1c58..f27e23b7 100755 --- a/src/System/Gr/GrScreen.ZC +++ b/src/System/Gr/GrScreen.ZC @@ -344,13 +344,12 @@ U0 GrUpdateTextFG() U0 DCBlotColor8(CDC *dc, CDC *img) { - U8 *src = img->body, *b0 = dc->body; - I64 reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height; + U8 *b0 = dc->body; + I64 *src = img->body, reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height; for (k = 0; k < d0; k += 8) { - j = *(src(U64 *)); - src += 8; + j = *src++;; do { jj = j & 0xFF; @@ -364,10 +363,13 @@ U0 DCBlotColor8(CDC *dc, CDC *img) } } -U0 GrCalcScreenUpdates() +//#define GR_CALC_SCREEN_UPDATES_SKIP 2 +//#define GR_CALC_SCREEN_UPDATES_CHUNKS 4 +$IV,0$U0 GrCalcScreenUpdates() { - U64 *screen, *last_screen = gr.screen_cache; - U64 i, *src = text.raw_screen, *dst = text.fb_alias, diffs_size = GR_WIDTH * GR_HEIGHT / 8; + U64 reg *screen, reg *last_screen = gr.screen_cache, i, ii, *src = text.raw_screen, *dst = text.fb_alias, reg RDX diffs_size = GR_WIDTH * GR_HEIGHT / 8, +// skip = gr.screen_cache[0], // use 1st U8 of cache as flag to skip cache MemCopy every-other call. +/* reg R9 skip_size64, reg R8 skip_diff, skip_size8*/; if (gr.screen_zoom == 1) screen = gr.dc2->body; @@ -376,38 +378,76 @@ U0 GrCalcScreenUpdates() for (i = 0; i < diffs_size; i++) if (screen[i] != last_screen[i]) - MemCopy(dst + i * 4, src + i * 4, 4 * 64); + { + ii = i * 4; + dst[ii] = src[ii++]; + dst[ii] = src[ii++]; + dst[ii] = src[ii++]; + dst[ii] = src[ii]; + } +/* + if (skip < GR_CALC_SCREEN_UPDATES_CHUNKS * GR_CALC_SCREEN_UPDATES_SKIP) + { + if (skip % GR_CALC_SCREEN_UPDATES_SKIP == 0) + { + skip_size64 = skip / GR_CALC_SCREEN_UPDATES_SKIP * text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS / 8; +// skip_diff = diffs_size * 8 / GR_CALC_SCREEN_UPDATES_CHUNKS; +// skip_size8 = skip / GR_CALC_SCREEN_UPDATES_SKIP * skip_diff; + MemCopy(dst + skip_size64, + src + skip_size64, + text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS); +// MemCopy(gr.screen_cache + skip_size8, screen(U8 *) + skip_size8, skip_diff); + } + } + else + skip = -1; + + gr.screen_cache[0] = ++skip; +*/ MemCopy(gr.screen_cache, screen, diffs_size * 8); + } U0 GrUpdateScreen32() { - U64 size, *dst, reg src64val; - U8 *src, reg a, reg b, c, reg d; + U64 size, *dst, src64val, *src; + U8 a, b, c, d; + + PUSHFD + CLI if (gr.screen_zoom == 1) { src = gr.dc2->body; - size = src + gr.dc2->height * gr.dc2->width_internal; + size = src(U8 *) + gr.dc2->height * gr.dc2->width_internal; } else { GrZoomInScreen; src = gr.zoomed_dc->body; - size = src + gr.zoomed_dc->height * gr.zoomed_dc->width_internal; + size = src(U8 *) + gr.zoomed_dc->height * gr.zoomed_dc->width_internal; } + POPFD + dst = text.raw_screen; - while (src < size) // draw 4 pixels at a time + + while (src < size) // draw 8 pixels at a time { src64val = *(src(U64 *)); a = src64val & 0xFF; b = (src64val >>= 8) & 0xFF; c = (src64val >>= 8) & 0xFF; d = (src64val >>= 8) & 0xFF; - src += 4; *dst++ = gr_palette[a] | gr_palette[b] << 32; *dst++ = gr_palette[c] | gr_palette[d] << 32; + a = (src64val >>= 8) & 0xFF; + b = (src64val >>= 8) & 0xFF; + c = (src64val >>= 8) & 0xFF; + d = (src64val >>= 8) & 0xFF; + *dst++ = gr_palette[a] | gr_palette[b] << 32; + *dst++ = gr_palette[c] | gr_palette[d] << 32; + src++; } GrCalcScreenUpdates;