To: vim-dev@vim.org Subject: Patch 7.1.310 Fcc: outbox From: Bram Moolenaar Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 8bit ------------ Patch 7.1.310 Problem: Incomplete utf-8 byte sequence at end of the file is not detected. Accessing memory that wasn't written. Solution: Check the last bytes in the buffer for being a valid utf-8 character. (mostly by Ben Schmidt) Also fix that the reported line number of the error was wrong. Files: src/fileio.c *** ../vim-7.1.309/src/fileio.c Wed May 7 19:05:55 2008 --- src/fileio.c Wed Jun 4 18:28:48 2008 *************** *** 1288,1299 **** #ifdef FEAT_MBYTE else if (conv_restlen > 0) { ! /* Reached end-of-file but some trailing bytes could ! * not be converted. Truncated file? */ ! if (conv_error == 0) ! conv_error = linecnt; ! if (bad_char_behavior != BAD_DROP) { fio_flags = 0; /* don't convert this */ # ifdef USE_ICONV if (iconv_fd != (iconv_t)-1) --- 1288,1336 ---- #ifdef FEAT_MBYTE else if (conv_restlen > 0) { ! /* ! * Reached end-of-file but some trailing bytes could ! * not be converted. Truncated file? ! */ ! ! /* When we did a conversion report an error. */ ! if (fio_flags != 0 ! # ifdef USE_ICONV ! || iconv_fd != (iconv_t)-1 ! # endif ! ) { + if (conv_error == 0) + conv_error = curbuf->b_ml.ml_line_count + - linecnt + 1; + } + /* Remember the first linenr with an illegal byte */ + else if (illegal_byte == 0) + illegal_byte = curbuf->b_ml.ml_line_count + - linecnt + 1; + if (bad_char_behavior == BAD_DROP) + { + *(ptr - conv_restlen) = NUL; + conv_restlen = 0; + } + else + { + /* Replace the trailing bytes with the replacement + * character if we were converting; if we weren't, + * leave the UTF8 checking code to do it, as it + * works slightly differently. */ + if (bad_char_behavior != BAD_KEEP && (fio_flags != 0 + # ifdef USE_ICONV + || iconv_fd != (iconv_t)-1 + # endif + )) + { + while (conv_restlen > 0) + { + *(--ptr) = bad_char_behavior; + --conv_restlen; + } + } fio_flags = 0; /* don't convert this */ # ifdef USE_ICONV if (iconv_fd != (iconv_t)-1) *************** *** 1302,1321 **** iconv_fd = (iconv_t)-1; } # endif - if (bad_char_behavior == BAD_KEEP) - { - /* Keep the trailing bytes as-is. */ - size = conv_restlen; - ptr -= conv_restlen; - } - else - { - /* Replace the trailing bytes with the - * replacement character. */ - size = 1; - *--ptr = bad_char_behavior; - } - conv_restlen = 0; } } #endif --- 1339,1344 ---- *************** *** 1397,1402 **** --- 1420,1430 ---- goto retry; } } + + /* Include not converted bytes. */ + ptr -= conv_restlen; + size += conv_restlen; + conv_restlen = 0; #endif /* * Break here for a read error or end-of-file. *************** *** 1406,1416 **** #ifdef FEAT_MBYTE - /* Include not converted bytes. */ - ptr -= conv_restlen; - size += conv_restlen; - conv_restlen = 0; - # ifdef USE_ICONV if (iconv_fd != (iconv_t)-1) { --- 1434,1439 ---- *************** *** 1872,1883 **** size = (long)((ptr + real_size) - dest); ptr = dest; } ! else if (enc_utf8 && conv_error == 0 && !curbuf->b_p_bin) { ! /* Reading UTF-8: Check if the bytes are valid UTF-8. ! * Need to start before "ptr" when part of the character was ! * read in the previous read() call. */ ! for (p = ptr - utf_head_off(buffer, ptr); ; ++p) { int todo = (int)((ptr + size) - p); int l; --- 1895,1906 ---- size = (long)((ptr + real_size) - dest); ptr = dest; } ! else if (enc_utf8 && !curbuf->b_p_bin) { ! int incomplete_tail = FALSE; ! ! /* Reading UTF-8: Check if the bytes are valid UTF-8. */ ! for (p = ptr; ; ++p) { int todo = (int)((ptr + size) - p); int l; *************** *** 1891,1933 **** * read() will get the next bytes, we'll check it * then. */ l = utf_ptr2len_len(p, todo); ! if (l > todo) { ! /* Incomplete byte sequence, the next read() ! * should get them and check the bytes. */ ! p += todo; ! break; } ! if (l == 1) { /* Illegal byte. If we can try another encoding ! * do that. */ ! if (can_retry) break; - - /* Remember the first linenr with an illegal byte */ - if (illegal_byte == 0) - illegal_byte = readfile_linenr(linecnt, ptr, p); # ifdef USE_ICONV /* When we did a conversion report an error. */ if (iconv_fd != (iconv_t)-1 && conv_error == 0) conv_error = readfile_linenr(linecnt, ptr, p); # endif /* Drop, keep or replace the bad byte. */ if (bad_char_behavior == BAD_DROP) { ! mch_memmove(p, p+1, todo - 1); --p; --size; } else if (bad_char_behavior != BAD_KEEP) *p = bad_char_behavior; } ! p += l - 1; } } ! if (p < ptr + size) { /* Detected a UTF-8 error. */ rewind_retry: --- 1914,1969 ---- * read() will get the next bytes, we'll check it * then. */ l = utf_ptr2len_len(p, todo); ! if (l > todo && !incomplete_tail) { ! /* Avoid retrying with a different encoding when ! * a truncated file is more likely, or attempting ! * to read the rest of an incomplete sequence when ! * we have already done so. */ ! if (p > ptr || filesize > 0) ! incomplete_tail = TRUE; ! /* Incomplete byte sequence, move it to conv_rest[] ! * and try to read the rest of it, unless we've ! * already done so. */ ! if (p > ptr) ! { ! conv_restlen = todo; ! mch_memmove(conv_rest, p, conv_restlen); ! size -= conv_restlen; ! break; ! } } ! if (l == 1 || l > todo) { /* Illegal byte. If we can try another encoding ! * do that, unless at EOF where a truncated ! * file is more likely than a conversion error. */ ! if (can_retry && !incomplete_tail) break; # ifdef USE_ICONV /* When we did a conversion report an error. */ if (iconv_fd != (iconv_t)-1 && conv_error == 0) conv_error = readfile_linenr(linecnt, ptr, p); # endif + /* Remember the first linenr with an illegal byte */ + if (conv_error == 0 && illegal_byte == 0) + illegal_byte = readfile_linenr(linecnt, ptr, p); /* Drop, keep or replace the bad byte. */ if (bad_char_behavior == BAD_DROP) { ! mch_memmove(p, p + 1, todo - 1); --p; --size; } else if (bad_char_behavior != BAD_KEEP) *p = bad_char_behavior; } ! else ! p += l - 1; } } ! if (p < ptr + size && !incomplete_tail) { /* Detected a UTF-8 error. */ rewind_retry: *** ../vim-7.1.309/src/version.c Wed Jun 4 15:27:43 2008 --- src/version.c Wed Jun 4 19:35:16 2008 *************** *** 668,669 **** --- 673,676 ---- { /* Add new patch number below this line */ + /**/ + 310, /**/ -- Normal people believe that if it ain't broke, don't fix it. Engineers believe that if it ain't broke, it doesn't have enough features yet. (Scott Adams - The Dilbert principle) /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ \\\ download, build and distribute -- http://www.A-A-P.org /// \\\ help me help AIDS victims -- http://ICCF-Holland.org ///