mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-05-13 16:07:30 +00:00
🌐 fix: air-gapped DOCX preview — embed mammoth fallback in CDN doc
Codex P2 review on PR #12934: the CDN-rendered DOCX path always pulled docx-preview + jszip from cdn.jsdelivr.net. Air-gapped or corporate-filtered networks where jsdelivr is blocked would degrade to a static "Preview unavailable" message even though the server already had a local mammoth renderer that could produce readable output. Now the dispatcher renders mammoth first and embeds the sanitized output inside the CDN document as a hidden `#lc-fallback` block. The iframe's existing `typeof docx === 'undefined'` check (which fires when the CDN scripts can't load) un-hides the fallback so the user sees a real preview. The CDN-success path is unchanged: high-fidelity docx-preview output owns the viewport, and the mammoth fallback stays hidden. Two new safeguards: (1) Size budget (dispatcher): if base64(binary) + mammoth body + wrapper > 512 KB (the `attachment.text` cache cap), drop to mammoth-only so a giant document still renders. The `OFFICE_HTML_OUTPUT_CAP` constant mirrors `MAX_TEXT_CACHE_BYTES` from extract.ts (separate constant to avoid a circular import; its value is pinned by a unit test). (2) Render slot (iframe bootstrap): `lc-render` is hidden when the fallback shows so the empty padded slot doesn't sit above the mammoth content. Tests: existing CDN-path tests updated for the new `wordDocToHtmlViaCdn(buffer, mammothBody)` signature; new test for the embedded fallback structure (`#lc-fallback`, mammoth body content, "High-fidelity renderer unavailable" notice, render-slot hide); new constant pin and per-fixture cap-respect assertion.
This commit is contained in:
parent
485eb2dfff
commit
0c0b0ce887
2 changed files with 151 additions and 32 deletions
|
|
@ -32,8 +32,16 @@ describe('Office HTML producers', () => {
|
|||
expect(html).toContain('id="lc-doc-data"');
|
||||
expect(html).toContain('docx.renderAsync');
|
||||
expect(html).toContain('cdn.jsdelivr.net/npm/docx-preview@');
|
||||
// Mammoth-path artifact must NOT appear.
|
||||
expect(html).not.toContain('<article class="lc-docx">');
|
||||
/* The CDN doc now ALSO embeds the mammoth-rendered fallback in a
|
||||
* hidden `#lc-fallback` block — Codex P2 review on PR #12934. The
|
||||
* iframe bootstrap reveals it whenever `docx-preview` can't load
|
||||
* (corporate firewall, offline) so air-gapped operators see a
|
||||
* readable preview instead of "Preview unavailable". The
|
||||
* fallback's `<article class="lc-docx">` wrapper is the
|
||||
* server-rendered mammoth output, sanitized through the same
|
||||
* pipeline as the standalone mammoth path. */
|
||||
expect(html).toContain('id="lc-fallback"');
|
||||
expect(html).toContain('<article class="lc-docx">');
|
||||
});
|
||||
|
||||
test('routes a docx above the size cap through the mammoth fallback', async () => {
|
||||
|
|
@ -68,9 +76,17 @@ describe('Office HTML producers', () => {
|
|||
* being a vehicle for outbound exfiltration or supply-chain
|
||||
* compromise. */
|
||||
|
||||
/* `wordDocToHtmlViaCdn` now takes a pre-rendered mammoth body
|
||||
* string as a second argument (Codex P2 review on PR #12934 —
|
||||
* the body is embedded inside `#lc-fallback` for air-gapped
|
||||
* deployments). Tests use a placeholder body to assert pure
|
||||
* wrapper-structure behavior; the dispatcher-level test above
|
||||
* exercises the real mammoth body from the sample fixture. */
|
||||
const FAKE_FALLBACK_BODY = '<p>fallback-body</p>';
|
||||
|
||||
test('embeds the binary as base64 that round-trips to the original bytes', async () => {
|
||||
const original = readFixture('sample.docx');
|
||||
const html = await _internal.wordDocToHtmlViaCdn(original);
|
||||
const html = await _internal.wordDocToHtmlViaCdn(original, FAKE_FALLBACK_BODY);
|
||||
const match = html.match(
|
||||
/<script id="lc-doc-data" type="application\/octet-stream;base64">([^<]*)<\/script>/,
|
||||
);
|
||||
|
|
@ -80,7 +96,10 @@ describe('Office HTML producers', () => {
|
|||
});
|
||||
|
||||
test('pins both CDN scripts to specific versions with SRI integrity', async () => {
|
||||
const html = await _internal.wordDocToHtmlViaCdn(readFixture('sample.docx'));
|
||||
const html = await _internal.wordDocToHtmlViaCdn(
|
||||
readFixture('sample.docx'),
|
||||
FAKE_FALLBACK_BODY,
|
||||
);
|
||||
// Both deps loaded from jsdelivr at pinned versions.
|
||||
expect(html).toContain('https://cdn.jsdelivr.net/npm/jszip@3.10.1/');
|
||||
expect(html).toContain('https://cdn.jsdelivr.net/npm/docx-preview@0.3.7/');
|
||||
|
|
@ -95,7 +114,10 @@ describe('Office HTML producers', () => {
|
|||
});
|
||||
|
||||
test('CSP locks the iframe down: no outbound connect, no eval, scripts only from jsdelivr', async () => {
|
||||
const html = await _internal.wordDocToHtmlViaCdn(readFixture('sample.docx'));
|
||||
const html = await _internal.wordDocToHtmlViaCdn(
|
||||
readFixture('sample.docx'),
|
||||
FAKE_FALLBACK_BODY,
|
||||
);
|
||||
const cspMatch = html.match(
|
||||
/<meta http-equiv="Content-Security-Policy" content="([^"]+)">/,
|
||||
);
|
||||
|
|
@ -114,15 +136,30 @@ describe('Office HTML producers', () => {
|
|||
expect(csp).not.toMatch(/unsafe-eval/);
|
||||
});
|
||||
|
||||
test('exposes a fallback message that surfaces if the renderer fails to load', async () => {
|
||||
const html = await _internal.wordDocToHtmlViaCdn(readFixture('sample.docx'));
|
||||
// Visible loading state and a fallback that swaps in on error.
|
||||
test('embeds the mammoth-rendered fallback body in #lc-fallback (air-gapped deployments)', async () => {
|
||||
const html = await _internal.wordDocToHtmlViaCdn(
|
||||
readFixture('sample.docx'),
|
||||
FAKE_FALLBACK_BODY,
|
||||
);
|
||||
/* Visible loading state. */
|
||||
expect(html).toContain('Loading preview…');
|
||||
expect(html).toContain('Preview unavailable');
|
||||
// The bootstrap script checks `typeof docx === 'undefined'` so
|
||||
// a CDN outage degrades gracefully rather than leaving a
|
||||
// permanently empty iframe.
|
||||
/* The fallback body now contains the server-rendered mammoth
|
||||
* output (the placeholder body in this test). When the iframe
|
||||
* detects `docx-preview` failed to load, `showFallback`
|
||||
* un-hides this block — Codex P2 review on PR #12934. The old
|
||||
* static "Preview unavailable" text is gone in favor of a
|
||||
* notice + the actual document content. */
|
||||
expect(html).toContain('id="lc-fallback"');
|
||||
expect(html).toContain(FAKE_FALLBACK_BODY);
|
||||
expect(html).toContain('High-fidelity renderer unavailable');
|
||||
/* The bootstrap script checks `typeof docx === 'undefined'`
|
||||
* so a CDN outage degrades to the fallback rather than an
|
||||
* empty iframe. */
|
||||
expect(html).toContain("typeof docx === 'undefined'");
|
||||
/* And it hides the empty render slot when fallback shows so
|
||||
* the mammoth content owns the viewport. */
|
||||
expect(html).toContain("document.getElementById('lc-render')");
|
||||
expect(html).toContain('render.hidden = true');
|
||||
});
|
||||
|
||||
test('size-fallback threshold is the documented 350 KB', async () => {
|
||||
|
|
@ -131,6 +168,27 @@ describe('Office HTML producers', () => {
|
|||
* `MAX_TEXT_CACHE_BYTES` reasoning above it. */
|
||||
expect(_internal.MAX_DOCX_CDN_BINARY_BYTES).toBe(350 * 1024);
|
||||
});
|
||||
|
||||
test('output cap mirrors `MAX_TEXT_CACHE_BYTES` from extract.ts', async () => {
|
||||
/* Pin the cycle-avoidance constant. If the upstream
|
||||
* `MAX_TEXT_CACHE_BYTES` ever changes (e.g. lifting the cap
|
||||
* for office types specifically), update both at the same
|
||||
* time or the dispatcher's size-budget path will misfire. */
|
||||
expect(_internal.OFFICE_HTML_OUTPUT_CAP).toBe(512 * 1024);
|
||||
});
|
||||
|
||||
test('output stays within the cache cap for the standard fixture', async () => {
|
||||
/* The fixture isn't large enough to hit the size-budget
|
||||
* fallback, but the resulting HTML *must* fit under the cap so
|
||||
* `attachment.text` doesn't get truncated mid-document.
|
||||
* Pinning this on the standard fixture catches regressions
|
||||
* where wrapper boilerplate or DOCX_EXTRA_CSS grows past the
|
||||
* 512 KB ceiling. Codex P2 review on PR #12934. */
|
||||
const html = await wordDocToHtml(readFixture('sample.docx'));
|
||||
expect(Buffer.byteLength(html, 'utf-8')).toBeLessThanOrEqual(
|
||||
_internal.OFFICE_HTML_OUTPUT_CAP,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('OFFICE_PREVIEW_DISABLE_CDN escape hatch', () => {
|
||||
|
|
|
|||
|
|
@ -350,6 +350,19 @@ const DOCX_PREVIEW_CDN = {
|
|||
*/
|
||||
const MAX_DOCX_CDN_BINARY_BYTES = 350 * 1024;
|
||||
|
||||
/**
|
||||
* Mirror of `MAX_TEXT_CACHE_BYTES` from `~/files/code/extract` — the
|
||||
* 512 KB ceiling that `attachment.text` is truncated to before hitting
|
||||
* the SSE wire and the database. We mirror (rather than import) to
|
||||
* avoid the cycle: `extract.ts` already imports `bufferToOfficeHtml`
|
||||
* from this module. The dispatcher uses this to drop CDN-with-fallback
|
||||
* docs that would exceed the cap and fall back to mammoth-only.
|
||||
*
|
||||
* If the upstream constant ever changes, update this value too. The
|
||||
* `output cap mirrors MAX_TEXT_CACHE_BYTES from extract.ts` test in `html.spec.ts` pins the value.
|
||||
*/
|
||||
const OFFICE_HTML_OUTPUT_CAP = 512 * 1024;
|
||||
|
||||
/**
|
||||
* Build the CDN-rendered HTML document for a DOCX. The base64 payload
|
||||
* lives inside a `<script type="application/octet-stream;base64">`
|
||||
|
|
@ -363,7 +376,7 @@ const MAX_DOCX_CDN_BINARY_BYTES = 350 * 1024;
|
|||
* for inline images), styles inline (`docx-preview` injects per-doc
|
||||
* styles into `<head>` at render time).
|
||||
*/
|
||||
function buildDocxCdnDocument(base64: string): string {
|
||||
function buildDocxCdnDocument(base64: string, mammothFallbackHtml: string): string {
|
||||
const csp = [
|
||||
"default-src 'none'",
|
||||
"script-src https://cdn.jsdelivr.net 'unsafe-inline'",
|
||||
|
|
@ -374,6 +387,15 @@ function buildDocxCdnDocument(base64: string): string {
|
|||
"base-uri 'none'",
|
||||
"form-action 'none'",
|
||||
].join('; ');
|
||||
/* Body styling for the embedded mammoth fallback. The CDN-rendered
|
||||
* path normally hides this content, but on air-gapped networks where
|
||||
* `cdn.jsdelivr.net` is blocked the fallback handler reveals it so
|
||||
* the user gets a readable preview instead of the legacy "Preview
|
||||
* unavailable" message — Codex P2 review on PR #12934. We inline the
|
||||
* shared `DOCX_EXTRA_CSS` rules here (rather than `<link>` to a
|
||||
* cross-origin sheet) because the CSP locks `style-src` to inline
|
||||
* only and the wrapped mammoth output uses the same `.lc-docx`
|
||||
* classes. */
|
||||
return `<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
|
|
@ -382,12 +404,14 @@ function buildDocxCdnDocument(base64: string): string {
|
|||
<meta http-equiv="Content-Security-Policy" content="${csp}">
|
||||
<title>Preview</title>
|
||||
<style>
|
||||
:root { color-scheme: light dark; --bg: #ffffff; --fg: #1f2937; --muted: #6b7280; }
|
||||
@media (prefers-color-scheme: dark) { :root { --bg: #1a1a2e; --fg: #e5e7eb; --muted: #9ca3af; } }
|
||||
:root { color-scheme: light dark; --bg: #ffffff; --fg: #1f2937; --muted: #6b7280; --link: #2563eb; --border: #e5e7eb; --header-bg: #f3f4f6; --row-alt: #f9fafb; }
|
||||
@media (prefers-color-scheme: dark) { :root { --bg: #1a1a2e; --fg: #e5e7eb; --muted: #9ca3af; --link: #93c5fd; --border: #2d3142; --header-bg: #232842; --row-alt: #1f2440; } }
|
||||
html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; }
|
||||
#lc-render { padding: 16px; }
|
||||
#lc-fallback { padding: 24px; font-size: 14px; line-height: 1.5; color: var(--muted); text-align: center; }
|
||||
#lc-fallback { padding: 24px 24px 32px; font-size: 14px; line-height: 1.5; color: var(--fg); }
|
||||
#lc-fallback-notice { font-size: 12px; color: var(--muted); border-bottom: 1px solid var(--border); padding-bottom: 8px; margin: 0 0 16px; }
|
||||
.lc-docx-loading { display: flex; align-items: center; justify-content: center; height: 60vh; color: var(--muted); font-size: 14px; }
|
||||
${DOCX_EXTRA_CSS}
|
||||
/* docx-preview emits its own per-document <style> tags inside #lc-render
|
||||
* — leave them be. These rules just keep the host frame consistent with
|
||||
* dark mode and bound the rendered document width. */
|
||||
|
|
@ -419,13 +443,21 @@ html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); fon
|
|||
</head>
|
||||
<body>
|
||||
<div id="lc-render"><div class="lc-docx-loading">Loading preview…</div></div>
|
||||
<div id="lc-fallback" hidden>Preview unavailable. Please download the file to view it.</div>
|
||||
<div id="lc-fallback" hidden>
|
||||
<p id="lc-fallback-notice">High-fidelity renderer unavailable (CDN blocked or offline). Showing the simplified preview below.</p>
|
||||
<article class="lc-docx">${mammothFallbackHtml}</article>
|
||||
</div>
|
||||
<script id="lc-doc-data" type="application/octet-stream;base64">${base64}</script>
|
||||
<script>
|
||||
(function () {
|
||||
function showFallback(reason) {
|
||||
var loading = document.querySelector('.lc-docx-loading');
|
||||
if (loading) { loading.remove(); }
|
||||
/* Hide the empty render slot so the embedded mammoth preview owns
|
||||
* the viewport — otherwise a stripe of padded empty space sits
|
||||
* above the fallback content. Codex P2 review on PR #12934. */
|
||||
var render = document.getElementById('lc-render');
|
||||
if (render) { render.hidden = true; }
|
||||
var fallback = document.getElementById('lc-fallback');
|
||||
if (fallback) {
|
||||
fallback.hidden = false;
|
||||
|
|
@ -475,14 +507,25 @@ html, body { margin: 0; padding: 0; background: var(--bg); color: var(--fg); fon
|
|||
</html>`;
|
||||
}
|
||||
|
||||
async function wordDocToHtmlViaCdn(buffer: Buffer): Promise<string> {
|
||||
return buildDocxCdnDocument(buffer.toString('base64'));
|
||||
/**
|
||||
* Run mammoth + sanitization to produce the inner DOCX body HTML
|
||||
* (the `<article>` contents). Shared between the standalone mammoth
|
||||
* path and the CDN path's fallback embedding so both render through
|
||||
* the exact same pipeline — no diverging sanitization rules. Codex P2
|
||||
* review on PR #12934.
|
||||
*/
|
||||
async function renderMammothBody(buffer: Buffer): Promise<string> {
|
||||
const { convertToHtml } = await import('mammoth');
|
||||
const result = await convertToHtml({ buffer }, { styleMap: DOCX_STYLE_MAP });
|
||||
return sanitizeOfficeHtml(result.value);
|
||||
}
|
||||
|
||||
async function wordDocToHtmlViaCdn(buffer: Buffer, mammothFallbackBody: string): Promise<string> {
|
||||
return buildDocxCdnDocument(buffer.toString('base64'), mammothFallbackBody);
|
||||
}
|
||||
|
||||
async function wordDocToHtmlViaMammoth(buffer: Buffer): Promise<string> {
|
||||
const { convertToHtml } = await import('mammoth');
|
||||
const result = await convertToHtml({ buffer }, { styleMap: DOCX_STYLE_MAP });
|
||||
const sanitized = await sanitizeOfficeHtml(result.value);
|
||||
const sanitized = await renderMammothBody(buffer);
|
||||
return wrapAsDocument(`<article class="lc-docx">${sanitized}</article>`, DOCX_EXTRA_CSS);
|
||||
}
|
||||
|
||||
|
|
@ -516,12 +559,20 @@ function isOfficePreviewCdnDisabled(): boolean {
|
|||
* binary as base64 and lets `docx-preview` render it inside the
|
||||
* Sandpack iframe. High visual fidelity — preserves cell shading,
|
||||
* run-level colors/fonts, headers/footers, columns, and images.
|
||||
* 2. **Mammoth (fallback for larger files OR when the CDN path is
|
||||
* explicitly disabled via `OFFICE_PREVIEW_DISABLE_CDN=true`)**:
|
||||
* server-side semantic HTML conversion. Lower fidelity (flat
|
||||
* paragraphs, no shading) but produces compact output that fits
|
||||
* the `MAX_TEXT_CACHE_BYTES` (512 KB) cap on `attachment.text`
|
||||
* even for large documents, and works without external network.
|
||||
* The mammoth-rendered HTML is *also* embedded as a hidden
|
||||
* `<div id="lc-fallback">` block; the iframe's bootstrap script
|
||||
* reveals it whenever `docx-preview` fails to load (corporate
|
||||
* firewall blocking jsdelivr, offline desktop, etc.) so air-
|
||||
* gapped deployments still get a readable preview instead of a
|
||||
* "Preview unavailable" message — Codex P2 review on PR #12934.
|
||||
* 2. **Mammoth-only (fallback for larger files, files where the
|
||||
* combined CDN-doc-with-fallback would blow the cache cap, OR
|
||||
* when the CDN path is explicitly disabled via
|
||||
* `OFFICE_PREVIEW_DISABLE_CDN=true`)**: server-side semantic HTML
|
||||
* conversion. Lower fidelity (flat paragraphs, no shading) but
|
||||
* produces compact output that fits the `MAX_TEXT_CACHE_BYTES`
|
||||
* (512 KB) cap on `attachment.text` even for large documents,
|
||||
* and works without external network.
|
||||
*
|
||||
* Both paths pre-flight through `assertSafeZipSize` so a zip-bomb DOCX
|
||||
* is rejected before either renderer touches it — mammoth's internal
|
||||
|
|
@ -531,13 +582,21 @@ function isOfficePreviewCdnDisabled(): boolean {
|
|||
*/
|
||||
export async function wordDocToHtml(buffer: Buffer): Promise<string> {
|
||||
await assertSafeZipSize(buffer, { name: 'docx' });
|
||||
if (isOfficePreviewCdnDisabled()) {
|
||||
if (isOfficePreviewCdnDisabled() || buffer.length > MAX_DOCX_CDN_BINARY_BYTES) {
|
||||
return wordDocToHtmlViaMammoth(buffer);
|
||||
}
|
||||
if (buffer.length <= MAX_DOCX_CDN_BINARY_BYTES) {
|
||||
return wordDocToHtmlViaCdn(buffer);
|
||||
/* Render mammoth first so its sanitized output can be embedded as
|
||||
* the iframe's air-gapped fallback. If the combined size would
|
||||
* exceed the 512 KB cache cap, drop to mammoth-only — the user
|
||||
* loses high-fidelity rendering but still sees the document. The
|
||||
* size budget applies after mammoth runs because we can't know its
|
||||
* output size from the binary size alone. */
|
||||
const mammothBody = await renderMammothBody(buffer);
|
||||
const cdnDoc = await wordDocToHtmlViaCdn(buffer, mammothBody);
|
||||
if (Buffer.byteLength(cdnDoc, 'utf-8') > OFFICE_HTML_OUTPUT_CAP) {
|
||||
return wrapAsDocument(`<article class="lc-docx">${mammothBody}</article>`, DOCX_EXTRA_CSS);
|
||||
}
|
||||
return wordDocToHtmlViaMammoth(buffer);
|
||||
return cdnDoc;
|
||||
}
|
||||
|
||||
/* =============================================================================
|
||||
|
|
@ -1142,7 +1201,9 @@ async function pptxToSlideListHtmlInternal(buffer: Buffer): Promise<string> {
|
|||
export const _internal = {
|
||||
wordDocToHtmlViaCdn,
|
||||
wordDocToHtmlViaMammoth,
|
||||
renderMammothBody,
|
||||
MAX_DOCX_CDN_BINARY_BYTES,
|
||||
OFFICE_HTML_OUTPUT_CAP,
|
||||
DOCX_PREVIEW_CDN,
|
||||
pptxToHtmlViaCdn,
|
||||
MAX_PPTX_CDN_BINARY_BYTES,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue