Fossil

Changes On Branch bomRefactor
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch bomRefactor Excluding Merge-Ins

This is equivalent to a diff from a4cdc723 to 3f2f1e62

2013-03-19
17:40
Merge UTF-16 byte swapping fix and test-looks-like-utf command enhancements. ... (check-in: b4bec375 user: mistachkin tags: trunk)
17:37
Fix overly eager byte swapping when checking for UTF-16 text. ... (Closed-Leaf check-in: 3f2f1e62 user: mistachkin tags: bomRefactor)
08:59
Merge "cr-warning" branch to trunk: Fossil now warns before committing files with CR line-endings and offers to convert them to LF line-endings; fossil's diff cannot handle those. In checkin.c, use LOOK_BINARY in stead of LOOK_NUL, in case more flags are added to the BINARY detection. Rename LOOK_LENGTH to LOOK_LONG. ... (check-in: ea2598e4 user: jan.nijtmans tags: trunk)
08:34
Fix expected value of test-cases: The value of LOOK_LONE_CR is wrong in 19 cases. I leave it to Joe to fix the code. ... (check-in: 8af1541a user: jan.nijtmans tags: bomRefactor)
2013-03-18
23:47
Make sure that LOOK_CR is set even when a CR/LF pair is detected. Rename the LOOK_LENGTH flag to LOOK_LONG for clarify. Add LOOK_SHORT flag to indicate that the looks_like_utf16() function did not perform a full check. Support tests for UTF-16 in reverse byte order. Enhancements to the test-looks-like-utf command. ... (check-in: b0b3f2a4 user: mistachkin tags: bomRefactor)
12:37
Adapt test-case 112 such that it contains a reversed CR/LF, a case not covered before. Fix detection of reversed CR/LF and lone CR in reversed UTF-16 case, broken by [e3f9a42b58]. ... (check-in: a4cdc723 user: jan.nijtmans tags: trunk)
11:45
Add test-cases using reverse UTF-16 BOM, and the unicode characters U+0A00 and U+0D00 ... (check-in: d1f0c4b9 user: jan.nijtmans tags: trunk)

Changes to src/checkin.c.

908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
  const char *zFilename /* The full name of the file being committed. */
){
  int bReverse;           /* UTF-16 byte order is reversed? */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  int lookFlags;          /* output flags from looks_like_utf8/utf16() */
  int fHasNul;            /* the blob contains one or more NUL chars */
  int fHasCrLf;           /* the blob contains one or more CR/LF pairs */
  int fHasLength;         /* the blob contains an overly long line */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = could_be_utf16(p, &bReverse);
  if( fUnicode ){
    lookFlags = looks_like_utf16(p, bReverse);
  }else{
    lookFlags = looks_like_utf8(p);
  }
  fHasNul = (lookFlags & LOOK_NUL);
  fHasCrLf = (lookFlags & LOOK_CRLF);
  fHasLength = (lookFlags & LOOK_LENGTH);
  if( fHasNul || fHasLength || fHasCrLf || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;

    if( fHasNul || fHasLength ){
      if( binOk ){
        return 0; /* We don't want binary warnings for this file. */
      }
      if( !fHasNul && fHasLength ){
        zWarning = "long lines";
      }else{
        zWarning = "binary data";
      }
      zDisable = "\"binary-glob\" setting";
      zConvert = ""; /* We cannot convert binary files. */
    }else if( fHasCrLf && fUnicode ){







|













|
|






|



|







908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
  const char *zFilename /* The full name of the file being committed. */
){
  int bReverse;           /* UTF-16 byte order is reversed? */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  int lookFlags;          /* output flags from looks_like_utf8/utf16() */
  int fHasNul;            /* the blob contains one or more NUL chars */
  int fHasCrLf;           /* the blob contains one or more CR/LF pairs */
  int fHasLong;           /* the blob contains an overly long line */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = could_be_utf16(p, &bReverse);
  if( fUnicode ){
    lookFlags = looks_like_utf16(p, bReverse);
  }else{
    lookFlags = looks_like_utf8(p);
  }
  fHasNul = (lookFlags & LOOK_NUL);
  fHasCrLf = (lookFlags & LOOK_CRLF);
  fHasLong = (lookFlags & LOOK_LONG);
  if( fHasNul || fHasLong || fHasCrLf || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;

    if( fHasNul || fHasLong ){
      if( binOk ){
        return 0; /* We don't want binary warnings for this file. */
      }
      if( !fHasNul && fHasLong ){
        zWarning = "long lines";
      }else{
        zWarning = "binary data";
      }
      zDisable = "\"binary-glob\" setting";
      zConvert = ""; /* We cannot convert binary files. */
    }else if( fHasCrLf && fUnicode ){

Changes to src/diff.c.

72
73
74
75
76
77
78
79
80

81
82
83
84
85
86
87
88
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired CR char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LENGTH  ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */

#define LOOK_BINARY  (LOOK_NUL | LOOK_LENGTH) /* Binary. */
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)







|

>
|







72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired CR char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

307
308
309
310
311
312
313
    int c2 = c;
    c = *++z; ++j;
    if( c==0 ){
      flags |= LOOK_NUL;  /* NUL character in a file -> binary */
    }else if( c=='\n' ){
      flags |= LOOK_LF;
      if( c2=='\r' ){
        flags |= LOOK_CRLF;  /* Found LF preceded by CR */
      }else{
        flags |= LOOK_LONE_LF;
      }
      if( j>LENGTH_MASK ){
        flags |= LOOK_LENGTH;  /* Very long line -> binary */
      }
      j = 0;
    }else if( c=='\r' ){
      flags |= LOOK_CR;
      if( n<=1 || z[1]!='\n' ){
        flags |= LOOK_LONE_CR;  /* More chars, next char is not LF */
      }
    }
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LENGTH;  /* Very long line -> binary */
  }
  return flags;
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** This macro is used to swap the byte order of a UTF-16 character in the
** looks_like_utf16() function.
*/
#define UTF16_SWAP(ch)        (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)


/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  The return value
** is a combination of one or more of the LOOK_XXX flags (see above):
**
** !LOOK_BINARY -- The content appears to consist entirely of text; however,







|




|










|




















|
|





|
>







256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
    int c2 = c;
    c = *++z; ++j;
    if( c==0 ){
      flags |= LOOK_NUL;  /* NUL character in a file -> binary */
    }else if( c=='\n' ){
      flags |= LOOK_LF;
      if( c2=='\r' ){
        flags |= (LOOK_CR | LOOK_CRLF);  /* Found LF preceded by CR */
      }else{
        flags |= LOOK_LONE_LF;
      }
      if( j>LENGTH_MASK ){
        flags |= LOOK_LONG;  /* Very long line -> binary */
      }
      j = 0;
    }else if( c=='\r' ){
      flags |= LOOK_CR;
      if( n<=1 || z[1]!='\n' ){
        flags |= LOOK_LONE_CR;  /* More chars, next char is not LF */
      }
    }
  }
  if( j>LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ   (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK      ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** This macro is used to swap the byte order of a UTF-16 character in the
** looks_like_utf16() function.
*/
#define UTF16_SWAP(ch)         ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  The return value
** is a combination of one or more of the LOOK_XXX flags (see above):
**
** !LOOK_BINARY -- The content appears to consist entirely of text; however,
344
345
346
347
348
349
350



351
352
353

354


355
356
357
358
359
360
361
362



363
364
365
366
367
368
369

370
371
372
373
374
375
376
377
378
379
380


381
382
383
384
385
386
387
388
389
390
391
392
393
394

  if( n==0 ) return flags;  /* Empty file -> text */
  if( n%sizeof(WCHAR_T) ){
    flags |= LOOK_ODD;  /* Odd number of bytes -> binary (UTF-8?) */
    if( n<sizeof(WCHAR_T) ) return flags;  /* One byte -> binary (UTF-8?) */
  }
  c = *z;



  if( c==0 ){
    flags |= LOOK_NUL;  /* NUL character in a file -> binary */
  }else if( bReverse ){

    c = UTF16_SWAP(c);


  }
  j = (c!='\n');
  if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF);  /* Found LF as first char */
  while( 1 ){
    int c2 = c;
    n -= sizeof(WCHAR_T);
    if( n<sizeof(WCHAR_T) ) break;
    c = *++z;



    ++j;
    if( c==0 ){
      flags |= LOOK_NUL;  /* NUL character in a file -> binary */
    }else if( bReverse ){
      c = UTF16_SWAP(c);
    }
    if( c=='\n' ){

      if( c2=='\r' ){
        flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF);
      }else{
        flags |= (LOOK_LONE_LF | LOOK_LF);
      }
      if( j>UTF16_LENGTH_MASK ){
        flags |= LOOK_LENGTH;  /* Very long line -> binary */
      }
      j = 0;
    }else if( c2=='\r' ){
      flags |= (LOOK_CR | LOOK_LONE_CR);


    }
  }
  if( c=='\r' ){
    flags |= (LOOK_CR | LOOK_LONE_CR);  /* Found CR as last char */
  }
  if( j>UTF16_LENGTH_MASK ){
    flags |= LOOK_LENGTH;  /* Very long line -> binary */
  }
  return flags;
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.







>
>
>


|
>
|
>
>








>
>
>



<
<
<
|
>

|

|


|


|
|
>
>
|
|
<
<


|







346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376



377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393


394
395
396
397
398
399
400
401
402
403

  if( n==0 ) return flags;  /* Empty file -> text */
  if( n%sizeof(WCHAR_T) ){
    flags |= LOOK_ODD;  /* Odd number of bytes -> binary (UTF-8?) */
    if( n<sizeof(WCHAR_T) ) return flags;  /* One byte -> binary (UTF-8?) */
  }
  c = *z;
  if( bReverse ){
    c = UTF16_SWAP(c);
  }
  if( c==0 ){
    flags |= LOOK_NUL;  /* NUL character in a file -> binary */
  }else if( c=='\r' ){
    flags |= LOOK_CR;
    if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
      flags |= LOOK_LONE_CR;  /* More chars, next char is not LF */
    }
  }
  j = (c!='\n');
  if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF);  /* Found LF as first char */
  while( 1 ){
    int c2 = c;
    n -= sizeof(WCHAR_T);
    if( n<sizeof(WCHAR_T) ) break;
    c = *++z;
    if( bReverse ){
      c = UTF16_SWAP(c);
    }
    ++j;
    if( c==0 ){
      flags |= LOOK_NUL;  /* NUL character in a file -> binary */



    }else if( c=='\n' ){
      flags |= LOOK_LF;
      if( c2=='\r' ){
        flags |= (LOOK_CR | LOOK_CRLF);  /* Found LF preceded by CR */
      }else{
        flags |= LOOK_LONE_LF;
      }
      if( j>UTF16_LENGTH_MASK ){
        flags |= LOOK_LONG;  /* Very long line -> binary */
      }
      j = 0;
    }else if( c=='\r' ){
      flags |= LOOK_CR;
      if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
        flags |= LOOK_LONE_CR;  /* More chars, next char is not LF */
      }
    }


  }
  if( j>UTF16_LENGTH_MASK ){
    flags |= LOOK_LONG;  /* Very long line -> binary */
  }
  return flags;
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.
2521
2522
2523
2524
2525
2526
2527
2528

2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551

2552
2553
*/
void looks_like_utf_test_cmd(void){
  Blob blob;     /* the contents of the specified file */
  int fUtf8;     /* return value of starts_with_utf8_bom() */
  int fUtf16;    /* return value of starts_with_utf16_bom() */
  int fUnicode;  /* return value of could_be_utf16() */
  int lookFlags; /* output flags from looks_like_utf8/utf16() */
  int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */

  if( g.argc<3 ) usage("FILENAME");
  blob_read_from_file(&blob, g.argv[2]);
  fUtf8 = starts_with_utf8_bom(&blob, 0);
  fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse);
  fUnicode = could_be_utf16(&blob, &bReverse);
  lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) :
                       looks_like_utf8(&blob);
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bReverse?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
               (lookFlags&LOOK_BINARY)?"no":"yes");
  fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
  fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
  fossil_print("Has flag LOOK_LONE_CR: %s\n",
               (lookFlags&LOOK_LONE_CR)?"yes":"no");
  fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
  fossil_print("Has flag LOOK_LONE_LF: %s\n",
               (lookFlags&LOOK_LONE_LF)?"yes":"no");
  fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
  fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
  fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");

  blob_reset(&blob);
}







|
>
|


|
|
|
|



|










|

>


2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
*/
void looks_like_utf_test_cmd(void){
  Blob blob;     /* the contents of the specified file */
  int fUtf8;     /* return value of starts_with_utf8_bom() */
  int fUtf16;    /* return value of starts_with_utf16_bom() */
  int fUnicode;  /* return value of could_be_utf16() */
  int lookFlags; /* output flags from looks_like_utf8/utf16() */
  int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
  int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */
  if( g.argc!=3 ) usage("FILENAME");
  blob_read_from_file(&blob, g.argv[2]);
  fUtf8 = starts_with_utf8_bom(&blob, 0);
  fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
  fUnicode = could_be_utf16(&blob, &bRevUnicode);
  lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) :
                         looks_like_utf8(&blob);
  fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
  fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
  fossil_print("Starts with UTF-16 BOM: %s\n",
               fUtf16?(bRevUtf16?"reversed":"yes"):"no");
  fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
               (lookFlags&LOOK_BINARY)?"no":"yes");
  fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
  fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
  fossil_print("Has flag LOOK_LONE_CR: %s\n",
               (lookFlags&LOOK_LONE_CR)?"yes":"no");
  fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
  fossil_print("Has flag LOOK_LONE_LF: %s\n",
               (lookFlags&LOOK_LONE_LF)?"yes":"no");
  fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
  fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");
  fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
  fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");
  blob_reset(&blob);
}

Changes to test/utf.test.

more than 10,000 changes