Pikchr

Check-in [43eb7d9984]
Login
Overview
Comment:When outputting HTML-escaped text and an ampersand is encountered, only transform it to an HTML entity if it looks like it's not one already. HTML entities are left as is, noting that we do only a syntactic check and not a test for whether the entity is a standard/known one. This replaces [aca3d2dc1a00], reverting the doc changes made by that checkin.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 43eb7d998415bf725c6a87f7ad7d25d5f25a7a324d436c142c5c0046673a5ac0
User & Date: stephan 2022-06-14 14:32:16
References
2022-06-14
17:41
Expanded [43eb7d998415] to permit digits in HTML entities after the first character, to allow for sup2, frac12, and similar entities. Reported in [dcf1418ef16f]. (check-in: a28bee7108 user: stephan tags: trunk)
14:37 Reply: Invalid SVG for lone Ampersand (artifact: 0477e25d89 user: stephan)
Context
2022-06-14
15:25
Minor doc correction and code style tweak to the previous checkin. No functional changes. (check-in: b5a5779c8d user: stephan tags: trunk)
14:32
When outputting HTML-escaped text and an ampersand is encountered, only transform it to an HTML entity if it looks like it's not one already. HTML entities are left as is, noting that we do only a syntactic check and not a test for whether the entity is a standard/known one. This replaces [aca3d2dc1a00], reverting the doc changes made by that checkin. (check-in: 43eb7d9984 user: stephan tags: trunk)
2022-06-06
23:51
Removed some duplicated docs, one copy of which was out of date. (check-in: 1ecd389f95 user: stephan tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to pikchr.c.
4530
4531
4532
4533
4534
4535
4536





























4537
4538
4539
4540
4541
4542
4543
    p->zOut = z;
    p->nOutAlloc = nNew;
  }
  memcpy(p->zOut+p->nOut, zText, n);
  p->nOut += n;
  p->zOut[p->nOut] = 0;
}






























/*
** Append text to zOut with HTML characters escaped.
**
**   *  The space character is changed into non-breaking space (U+00a0)
**      if mFlags has the 0x01 bit set. This is needed when outputting
**      text to preserve leading and trailing whitespace.  Turns out we







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
    p->zOut = z;
    p->nOutAlloc = nNew;
  }
  memcpy(p->zOut+p->nOut, zText, n);
  p->nOut += n;
  p->zOut[p->nOut] = 0;
}

/*
** Given a string and its length, returns true if the string begins
** with a construct which syntactically matches an HTML entity escape
** sequence (without checking for whether it's a known entity). Always
** returns false if zText[0] is not '&' or n<4. Entities match the
** equivalent of the regexes `&#[0-9]+;` and `&[a-zA-Z][a-zA-Z0-9]+;`.
**
** zText need not be NUL-terminated: at most n bytes are examined.
**
** Numeric references may have a single digit (e.g. `&#9;`). Named
** references must start with a letter and be at least two characters
** long; digits are accepted after the first character so that names
** such as `&sup2;` and `&frac12;` are recognized.
*/
static int pik_isentity(char const * zText, int n){
  int i = 0;
  if( n<4 || '&'!=zText[0] ) return 0;
  /* Step past the '&' */
  n--;
  zText++;
  if( '#'==zText[0] ){
    /* Numeric character reference: step past the '#' */
    zText++;
    n--;
    for(i=0; i<n; i++){
      /* A ';' terminates the reference once at least one digit was seen */
      if( i>0 && ';'==zText[i] ) return 1;
      else if( zText[i]<'0' || zText[i]>'9' ) return 0;
    }
  }else{
    for( i=0; i<n; i++ ){
      /* A ';' terminates the name once at least two characters were seen */
      if( i>1 && ';'==zText[i] ) return 1;
      /* Digits are allowed anywhere except as the first character */
      else if( i>0 && zText[i]>='0' && zText[i]<='9' ) continue;
      else if( zText[i]<'A' || zText[i]>'z'
               || (zText[i]>'Z' && zText[i]<'a') ) return 0;
    }
  }
  /* Ran out of input before finding the terminating ';' */
  return 0;
}

/*
** Append text to zOut with HTML characters escaped.
**
**   *  The space character is changed into non-breaking space (U+00a0)
**      if mFlags has the 0x01 bit set. This is needed when outputting
**      text to preserve leading and trailing whitespace.  Turns out we
4562
4563
4564
4565
4566
4567
4568
4569
4570



4571
4572
4573
4574
4575
4576
4577
      if( c=='&' && bQAmp ) break;
    }
    if( i ) pik_append(p, zText, i);
    if( i==n ) break;
    switch( c ){
      case '<': {  pik_append(p, "&lt;", 4);  break;  }
      case '>': {  pik_append(p, "&gt;", 4);  break;  }
      case '&': {  pik_append(p, "&amp;", 5);  break;  }
      case ' ': {  pik_append(p, "\302\240;", 2);  break;  }



    }
    i++;
    n -= i;
    zText += i;
    i = 0;
  }
}







<

>
>
>







4591
4592
4593
4594
4595
4596
4597

4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
      if( c=='&' && bQAmp ) break;
    }
    if( i ) pik_append(p, zText, i);
    if( i==n ) break;
    switch( c ){
      case '<': {  pik_append(p, "&lt;", 4);  break;  }
      case '>': {  pik_append(p, "&gt;", 4);  break;  }

      case ' ': {  pik_append(p, "\302\240;", 2);  break;  }
      case '&':
        if( pik_isentity(zText+i, n-i) ){ pik_append(p, "&", 1); }
        else { pik_append(p, "&amp;", 5); }
    }
    i++;
    n -= i;
    zText += i;
    i = 0;
  }
}
8094
8095
8096
8097
8098
8099
8100
8101
  return TCL_OK;
}


#endif /* PIKCHR_TCL */


#line 8126 "pikchr.c"







|
8125
8126
8127
8128
8129
8130
8131
8132
  return TCL_OK;
}


#endif /* PIKCHR_TCL */


#line 8157 "pikchr.c"
Changes to pikchr.y.
1922
1923
1924
1925
1926
1927
1928





























1929
1930
1931
1932
1933
1934
1935
    p->zOut = z;
    p->nOutAlloc = nNew;
  }
  memcpy(p->zOut+p->nOut, zText, n);
  p->nOut += n;
  p->zOut[p->nOut] = 0;
}






























/*
** Append text to zOut with HTML characters escaped.
**
**   *  The space character is changed into non-breaking space (U+00a0)
**      if mFlags has the 0x01 bit set. This is needed when outputting
**      text to preserve leading and trailing whitespace.  Turns out we







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
    p->zOut = z;
    p->nOutAlloc = nNew;
  }
  memcpy(p->zOut+p->nOut, zText, n);
  p->nOut += n;
  p->zOut[p->nOut] = 0;
}

/*
** Given a string and its length, returns true if the string begins
** with a construct which syntactically matches an HTML entity escape
** sequence (without checking for whether it's a known entity). Always
** returns false if zText[0] is not '&' or n<4. Entities match the
** equivalent of the regexes `&#[0-9]+;` and `&[a-zA-Z][a-zA-Z0-9]+;`.
**
** zText need not be NUL-terminated: at most n bytes are examined.
**
** Numeric references may have a single digit (e.g. `&#9;`). Named
** references must start with a letter and be at least two characters
** long; digits are accepted after the first character so that names
** such as `&sup2;` and `&frac12;` are recognized.
*/
static int pik_isentity(char const * zText, int n){
  int i = 0;
  if( n<4 || '&'!=zText[0] ) return 0;
  /* Step past the '&' */
  n--;
  zText++;
  if( '#'==zText[0] ){
    /* Numeric character reference: step past the '#' */
    zText++;
    n--;
    for(i=0; i<n; i++){
      /* A ';' terminates the reference once at least one digit was seen */
      if( i>0 && ';'==zText[i] ) return 1;
      else if( zText[i]<'0' || zText[i]>'9' ) return 0;
    }
  }else{
    for( i=0; i<n; i++ ){
      /* A ';' terminates the name once at least two characters were seen */
      if( i>1 && ';'==zText[i] ) return 1;
      /* Digits are allowed anywhere except as the first character */
      else if( i>0 && zText[i]>='0' && zText[i]<='9' ) continue;
      else if( zText[i]<'A' || zText[i]>'z'
               || (zText[i]>'Z' && zText[i]<'a') ) return 0;
    }
  }
  /* Ran out of input before finding the terminating ';' */
  return 0;
}

/*
** Append text to zOut with HTML characters escaped.
**
**   *  The space character is changed into non-breaking space (U+00a0)
**      if mFlags has the 0x01 bit set. This is needed when outputting
**      text to preserve leading and trailing whitespace.  Turns out we
1954
1955
1956
1957
1958
1959
1960
1961
1962



1963
1964
1965
1966
1967
1968
1969
      if( c=='&' && bQAmp ) break;
    }
    if( i ) pik_append(p, zText, i);
    if( i==n ) break;
    switch( c ){
      case '<': {  pik_append(p, "&lt;", 4);  break;  }
      case '>': {  pik_append(p, "&gt;", 4);  break;  }
      case '&': {  pik_append(p, "&amp;", 5);  break;  }
      case ' ': {  pik_append(p, "\302\240;", 2);  break;  }



    }
    i++;
    n -= i;
    zText += i;
    i = 0;
  }
}







<

>
>
>







1983
1984
1985
1986
1987
1988
1989

1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
      if( c=='&' && bQAmp ) break;
    }
    if( i ) pik_append(p, zText, i);
    if( i==n ) break;
    switch( c ){
      case '<': {  pik_append(p, "&lt;", 4);  break;  }
      case '>': {  pik_append(p, "&gt;", 4);  break;  }

      case ' ': {  pik_append(p, "\302\240;", 2);  break;  }
      case '&':
        if( pik_isentity(zText+i, n-i) ){ pik_append(p, "&", 1); }
        else { pik_append(p, "&amp;", 5); }
    }
    i++;
    n -= i;
    zText += i;
    i = 0;
  }
}