Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML sanitizer for descriptions. #2785

Merged
merged 32 commits into from
Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2881e35
HTML sanitizer for descriptions.
zerline Feb 16, 2020
b79173f
Schema update.
zerline Feb 16, 2020
6b56c15
'description_html' can be modified.
zerline Feb 17, 2020
414ddd5
Attempt for a javascript-side sanitization. Based on DOMPurifier.
zerline May 18, 2020
e1025a9
Document all ToggleButton parameters.
zerline Jun 7, 2020
6a58fd5
Using sanitize-html.
zerline Jun 7, 2020
05b4079
TS compiling.
zerline Jun 7, 2020
e97263b
Adding a plaintext sanitizer (ie tags stripper).
zerline Jun 7, 2020
689d2c0
Schema update.
zerline Jun 7, 2020
17451ec
Merge branch 'master' into HTMLSanitizer
zerline Jun 7, 2020
7a03bb4
Putting things in the right place.
zerline Jun 8, 2020
fc0abfd
Dependency fix.
zerline Jun 8, 2020
99655c3
Test fix.
zerline Jun 8, 2020
58c1a58
Adding <span>.
zerline Jun 8, 2020
b64213a
Test notebook.
zerline Jun 8, 2020
5514bb9
Adding 'style' attribute.
zerline Jun 8, 2020
4b84b90
Little more styling.
zerline Jun 8, 2020
7646c54
No HTML is possible inside the tag '<button>'.
zerline Jun 8, 2020
2e8c38c
s/description_html/description_allow_html
zerline Jun 8, 2020
7f296f0
This almost works, but does not help to get a proper display for desc…
zerline Jun 15, 2020
85382cd
Question of length.
zerline Jun 15, 2020
1056286
Testing with Mathjax.
zerline Jun 15, 2020
dd5ef1f
Syntax correctness for lint.
zerline Jun 15, 2020
f28b5fc
Using LaTeX functions copied from Jupyterlab.
zerline Jul 1, 2020
3945f22
s/let/const.
zerline Jul 1, 2020
a6da1ad
Change plain text to just use textContent directly.
jasongrout Jul 10, 2021
d65b7c2
Merge remote-tracking branch 'origin/master' into HTMLSanitizer
jasongrout Jul 10, 2021
32665c4
Remove style tags during description sanitization
jasongrout Jul 10, 2021
be8adc4
Merge commit '280302e3cfc5262d805d658c910969fe0db50606' into HTMLSani…
jasongrout Jul 10, 2021
9a235d1
Lint
jasongrout Jul 10, 2021
7762fb7
Fix spec for new date and time pickers.
jasongrout Jul 10, 2021
df33205
Merge branch 'master' into HTMLSanitizer
jasongrout Jul 13, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion ipywidgets/widgets/widget_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@ class ToggleButton(_Bool):
value : {True,False}
value of the toggle button: True-pressed, False-unpressed
description : str
description displayed next to the button
description displayed on the button
icon: str
font-awesome icon name
style: instance of DescriptionStyle
styling customizations
button_style: enum
button predefined styling
"""
_view_name = Unicode('ToggleButtonView').tag(sync=True)
_model_name = Unicode('ToggleButtonModel').tag(sync=True)
Expand Down
2 changes: 1 addition & 1 deletion ipywidgets/widgets/widget_button.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Button(DOMWidget, CoreWidget):
Parameters
----------
description: str
description displayed next to the button
description displayed on the button
icon: str
font-awesome icon names, without the 'fa-' prefix
disabled: bool
Expand Down
3 changes: 2 additions & 1 deletion ipywidgets/widgets/widget_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"""Contains the DOMWidget class"""

from traitlets import Unicode
from traitlets import Bool, Unicode
from .widget import Widget, widget_serialization, register
from .trait_types import InstanceDict
from .widget_style import Style
Expand All @@ -21,6 +21,7 @@ class DescriptionWidget(DOMWidget, CoreWidget):
"""Widget that has a description label to the side."""
_model_name = Unicode('DescriptionModel').tag(sync=True)
description = Unicode('', help="Description of the control.").tag(sync=True)
description_allow_html = Bool(False, help="Accept HTML in the description.").tag(sync=True)
style = InstanceDict(DescriptionStyle, help="Styling customizations").tag(sync=True, **widget_serialization)

def _repr_keys(self):
Expand Down
4 changes: 3 additions & 1 deletion packages/base-manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,16 @@
"@jupyter-widgets/base": "^4.0.0-alpha.0",
"@jupyterlab/services": "^5.0.2",
"@lumino/coreutils": "^1.4.2",
"base64-js": "^1.2.1"
"base64-js": "^1.2.1",
"sanitize-html": "^1.20"
jasongrout marked this conversation as resolved.
Show resolved Hide resolved
},
"devDependencies": {
"@types/base64-js": "^1.2.5",
"@types/chai": "^4.1.7",
"@types/chai-as-promised": "^7.1.0",
"@types/expect.js": "^0.3.29",
"@types/mocha": "^5.2.7",
"@types/sanitize-html": "^1.20",
"@types/sinon": "^7.0.13",
"@types/sinon-chai": "^3.2.2",
"chai": "^4.0.0",
Expand Down
192 changes: 192 additions & 0 deletions packages/base-manager/src/latex.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/*-----------------------------------------------------------------------------
| Copyright (c) Jupyter Development Team.
| Distributed under the terms of the Modified BSD License.
|----------------------------------------------------------------------------*/
// Some magic for deferring mathematical expressions to MathJax
// by hiding them from the Markdown parser.
// Some of the code here is adapted with permission from Davide Cervone
// under the terms of the Apache2 license governing the MathJax project.
// Other minor modifications are also due to StackExchange and are used with
// permission.

// Delimiter that opens and closes inline math. deTilde (in removeMath) also
// restores this character for every "~D" escape.
const inline = '$'; // the inline math delimiter

// MATHSPLIT contains the pattern for math delimiters and special symbols
// needed for searching for math in the text input.
//
// The whole pattern is a single capture group, so String.prototype.split
// keeps every match in its result: odd indices of the split array are the
// matched tokens, even indices are the text in between.
// Alternatives, in order:
//   \$\$?                      "$" or "$$"
//   \\(?:begin|end)\{...\}     \begin{env} / \end{env} (optional trailing "*")
//   \\[{}$]                    backslash-escaped "{", "}" or "$"
//   [{}]                       a bare brace
//   (?:\n\s*)+                 one or more newlines with trailing whitespace
//   @@\d+@@                    text that looks like one of our placeholders
//   \\\\(?:\(|\)|\[|\])        two backslashes followed by "(", ")", "[" or "]"
// The "i" flag makes the environment name match case-insensitively.
const MATHSPLIT = /(\$\$?|\\(?:begin|end)\{[a-z]*\*?\}|\\[{}$]|[{}]|(?:\n\s*)+|@@\d+@@|\\\\(?:\(|\)|\[|\]))/i;

/**
 * Break up the text into its component parts and search
 * through them for math delimiters, braces, linebreaks, etc.
 * Math delimiters must match and braces must balance.
 * Don't allow math to pass through a double linebreak
 * (which will be a paragraph).
 *
 * @param text - Source text that may contain math. "\r\n" and "\r" line
 *   endings are normalized to "\n" before scanning.
 * @returns An object whose `text` is the input with every detected math span
 *   replaced by an `@@n@@` placeholder, and whose `math[n]` is the span that
 *   placeholder stands for (as stored by processMath). Input text that
 *   already looks like `@@n@@` is stored and replaced by a placeholder as well.
 */
export function removeMath(text: string): { text: string; math: string[] } {
  const math: string[] = []; // stores math strings for later
  let start: number | null = null; // index in `blocks` of the opening delimiter; null while outside math
  let end: string | null = null; // closing delimiter that matches the current opening one
  let last: number | null = null; // index of the latest closing delimiter seen while braces were unbalanced
  let braces = 0; // depth of unclosed "{" inside the current math span
  let deTilde: (text: string) => string;

  // Except for extreme edge cases, this should catch precisely those pieces of the markdown
  // source that will later be turned into code spans. While MathJax will not TeXify code spans,
  // we still have to consider them at this point; the following issue has happened several times:
  //
  // `$foo` and `$bar` are variables. --> <code>$foo ` and `$bar</code> are variables.
  const hasCodeSpans = /`/.test(text);
  if (hasCodeSpans) {
    // Escape "~" as "~T" first so that the "~D" written for each "$" inside
    // a code span cannot be confused with a tilde from the input.
    text = text
      .replace(/~/g, '~T')
      .replace(/(^|[^\\])(`+)([^\n]*?[^`\n])\2(?!`)/gm, wholematch =>
        wholematch.replace(/\$/g, '~D')
      );
    // Inverse of the escaping above: "~T" -> "~", "~D" -> "$".
    deTilde = (text: string) => {
      return text.replace(/~([TD])/g, (wholematch, character) =>
        character === 'T' ? '~' : inline
      );
    };
  } else {
    // Nothing was escaped, so there is nothing to undo.
    deTilde = (text: string) => {
      return text;
    };
  }

  let blocks = text.replace(/\r\n?/g, '\n').split(MATHSPLIT);

  // Odd indices hold the tokens captured by MATHSPLIT; even indices hold the
  // plain text between them, which never needs inspecting.
  for (let i = 1, m = blocks.length; i < m; i += 2) {
    const block = blocks[i];
    if (block.charAt(0) === '@') {
      //
      // Things that look like our math markers will get
      // stored and then retrieved along with the math.
      //
      blocks[i] = '@@' + math.length + '@@';
      math.push(block);
    } else if (start !== null) {
      //
      // If we are in math, look for the end delimiter,
      // but don't go past double line breaks,
      // and balance braces within the math.
      //
      if (block === end) {
        if (braces) {
          // Closing delimiter inside unbalanced braces: remember it as a
          // fallback end position and keep scanning.
          last = i;
        } else {
          blocks = processMath(start, i, deTilde, math, blocks);
          start = null;
          end = null;
          last = null;
        }
      } else if (block.match(/\n.*\n/)) {
        // Token containing two newlines: the math may not continue past it.
        // If a fallback end was recorded, close the math there and resume
        // the scan from that index; otherwise drop the candidate span.
        if (last !== null) {
          i = last;
          blocks = processMath(start, i, deTilde, math, blocks);
        }
        start = null;
        end = null;
        last = null;
        braces = 0;
      } else if (block === '{') {
        braces++;
      } else if (block === '}' && braces) {
        braces--;
      }
    } else {
      //
      // Look for math start delimiters and when
      // found, set up the end delimiter.
      //
      if (block === inline || block === '$$') {
        start = i;
        end = block;
        braces = 0;
      } else if (block === '\\\\(' || block === '\\\\[') {
        start = i;
        end = block.slice(-1) === '(' ? '\\\\)' : '\\\\]';
        braces = 0;
      } else if (block.substr(1, 5) === 'begin') {
        // "\begin{env}" -> expect "\end{env}"; substr(6) is the "{env}" part.
        start = i;
        end = '\\end' + block.substr(6);
        braces = 0;
      }
    }
  }
  // Input ended inside math: fall back to the last closing delimiter seen, if any.
  if (start !== null && last !== null) {
    blocks = processMath(start, last, deTilde, math, blocks);
    start = null;
    end = null;
    last = null;
  }
  return { text: deTilde(blocks.join('')), math };
}

/**
 * Put back the math strings that were saved.
 *
 * Every `@@n@@` placeholder in `text` is replaced by `math[n]`. The `math`
 * array is only read; it is not cleared or otherwise modified.
 *
 * @param text - Text containing `@@n@@` placeholders.
 * @param math - Saved math strings, indexed by placeholder number.
 * @returns The text with each placeholder substituted. A placeholder whose
 *   number has no entry in `math` is left in the text unchanged.
 */
export function replaceMath(text: string, math: string[]): string {
  /**
   * Replace a math placeholder with its corresponding group.
   * The math delimiters "\\(", "\\[", "\\)" and "\\]" are replaced
   * removing one backslash in order to be interpreted correctly by MathJax.
   */
  const restore = (placeholder: string, digits: string): string => {
    // The regex hands the capture group over as a string; convert it
    // explicitly rather than relying on implicit property-key coercion.
    const group = math[Number(digits)];
    if (group === undefined) {
      // No saved math for this number: keep the text as it was instead of
      // throwing on `undefined`.
      return placeholder;
    }
    const head = group.slice(0, 3);
    const tail = group.slice(-3);
    if (head === '\\\\(' && tail === '\\\\)') {
      return '\\(' + group.slice(3, -3) + '\\)';
    }
    if (head === '\\\\[' && tail === '\\\\]') {
      return '\\[' + group.slice(3, -3) + '\\]';
    }
    return group;
  };
  // Replace all the math group placeholders in the text
  // with the saved strings.
  return text.replace(/@@(\d+)@@/g, restore);
}

/**
 * Process math blocks.
 *
 * The math is in blocks i through j, so
 * collect it into one block and clear the others.
 * Replace &, <, and > by named entities.
 * For IE, put <br> at the ends of comments since IE removes \n.
 * Store a placeholder carrying the index of the math in blocks[i],
 * then push the math string onto the storage array.
 * The preProcess function is called on the collected block if it has been
 * passed in.
 *
 * @param i - Index of the first block of the math span.
 * @param j - Index of the last block of the math span (inclusive).
 * @param preProcess - Applied to the escaped math string before it is stored.
 * @param math - Storage array; the math string is appended to it.
 * @param blocks - Split text; modified in place.
 * @returns The same `blocks` array that was passed in.
 */
function processMath(
  i: number,
  j: number,
  preProcess: (input: string) => string,
  math: string[],
  blocks: string[]
): string[] {
  let block = blocks
    .slice(i, j + 1)
    .join('')
    .replace(/&/g, '&amp;') // use HTML entity for & (first, so the entities below are not re-escaped)
    .replace(/</g, '&lt;') // use HTML entity for <
    .replace(/>/g, '&gt;'); // use HTML entity for >
  // Use a `typeof` test: evaluating a bare `navigator` identifier throws a
  // ReferenceError in environments that do not define that global.
  if (
    typeof navigator !== 'undefined' &&
    navigator.appName === 'Microsoft Internet Explorer'
  ) {
    block = block.replace(/(%[^\n]*)\n/g, '$1<br/>\n');
  }
  // Blank out everything after the first block of the span.
  for (let k = j; k > i; k--) {
    blocks[k] = '';
  }
  blocks[i] = '@@' + math.length + '@@'; // replace the current block text with a unique tag to find later
  if (preProcess) {
    block = preProcess(block);
  }
  math.push(block);
  return blocks;
}
59 changes: 59 additions & 0 deletions packages/base-manager/src/manager-base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,52 @@ import {
} from '@jupyter-widgets/base';

import { base64ToBuffer, bufferToBase64, hexToBuffer } from './utils';
import { removeMath, replaceMath } from './latex';
import sanitize from 'sanitize-html';

const PROTOCOL_MAJOR_VERSION = PROTOCOL_VERSION.split('.', 1)[0];

/**
 * Reduce a plaintext description to bare text by removing all markup.
 *
 * @param s - The raw description.
 * @returns The result of running `s` through the sanitizer with no tag and
 *   no attribute allowed.
 */
function default_plaintext_sanitize(s: string): string {
  // Empty allow-lists: no tag and no attribute is let through.
  const stripEverything = { allowedTags: [], allowedAttributes: {} };
  return sanitize(s, stripEverything);
}

/**
 * Sanitize HTML-formatted descriptions.
 *
 * Only the inline and list tags listed below are allowed, and only the
 * listed attributes on them.
 *
 * `<style>` elements are deliberately not allowed: a style sheet embedded in
 * a description is not scoped to the widget and would restyle the whole
 * page. Per-element styling remains possible through the `style` attribute.
 *
 * @param s - The raw, possibly HTML-formatted description.
 * @returns The description as returned by the sanitizer for this allow-list.
 */
function default_inline_sanitize(s: string): string {
  const allowedTags = [
    'a',
    'abbr',
    'b',
    'code',
    'em',
    'i',
    'img',
    'li',
    'ol',
    'span',
    'strong',
    'ul'
  ];
  const allowedAttributes = {
    '*': ['aria-*', 'style', 'title'],
    a: ['href'],
    img: ['src']
  };
  return sanitize(s, {
    allowedTags: allowedTags,
    allowedAttributes: allowedAttributes
  });
}

export interface IState extends PartialJSONObject {
buffers?: IBase64Buffers[];
model_name: string;
Expand Down Expand Up @@ -462,6 +505,22 @@ export abstract class ManagerBase implements IWidgetManager {
return Promise.resolve(url);
}

/**
 * Sanitize a description that is to be displayed as plain text.
 *
 * Math is taken out before the tags are stripped and put back afterwards,
 * so the tag stripper only ever sees the non-math text.
 */
plaintext_sanitize(source: string): string {
  // Split the source into math-free text plus the list of extracted math.
  const { text, math } = removeMath(source);
  // Strip the tags from the text, then splice the math back in.
  return replaceMath(default_plaintext_sanitize(text), math);
}

/**
 * Sanitize a description that is to be displayed as inline HTML.
 *
 * Math is taken out before the HTML is sanitized and put back afterwards,
 * so the HTML sanitizer only ever sees the non-math text.
 */
inline_sanitize(source: string): string {
  // Split the source into math-free text plus the list of extracted math.
  const { text, math } = removeMath(source);
  // Sanitize tags for inline output, then splice the math back in.
  return replaceMath(default_inline_sanitize(text), math);
}

/**
* The comm target name to register
*/
Expand Down
3 changes: 3 additions & 0 deletions packages/base/src/manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,7 @@ export interface IWidgetManager {
* The default implementation just returns the original url.
*/
resolveUrl(url: string): Promise<string>;

/**
 * Sanitize a description that will be displayed as plain text.
 *
 * @param s - The raw description.
 * @returns The string to display.
 */
plaintext_sanitize(s: string): string;
/**
 * Sanitize a description that will be inserted into the page as HTML.
 *
 * @param s - The raw, possibly HTML-formatted description.
 * @returns The HTML string to insert.
 */
inline_sanitize(s: string): string;
}
7 changes: 7 additions & 0 deletions packages/base/test/src/dummy-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,13 @@ export class DummyManager implements widgets.IWidgetManager {
return Promise.resolve(url);
}

/**
 * Test stub: returns the input unchanged, no sanitization is performed.
 */
plaintext_sanitize(s: string): string {
  return s;
}
/**
 * Test stub: returns the input unchanged, no sanitization is performed.
 */
inline_sanitize(s: string): string {
  return s;
}

/**
* Dictionary of model ids and model instance promises
*/
Expand Down
15 changes: 12 additions & 3 deletions packages/controls/src/widget_bool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,19 @@ export class CheckboxView extends DescriptionView {
return;
}
const description = this.model.get('description');
this.descriptionSpan.innerHTML = description;
const plaintext_description = this.model.widget_manager.plaintext_sanitize(
jasongrout marked this conversation as resolved.
Show resolved Hide resolved
description
);
if (this.model.get('description_allow_html')) {
this.descriptionSpan.innerHTML = this.model.widget_manager.inline_sanitize(
description
);
} else {
this.descriptionSpan.textContent = plaintext_description;
}
this.typeset(this.descriptionSpan);
this.descriptionSpan.title = description;
this.checkbox.title = description;
this.descriptionSpan.title = plaintext_description;
this.checkbox.title = plaintext_description;
}

/**
Expand Down
Loading