From 01673d1c4a9bd1c439a44dc5b702f3a25746726e Mon Sep 17 00:00:00 2001 From: dal Date: Mon, 15 Sep 2025 14:45:54 -0600 Subject: [PATCH] Added HTML entity decoding functions to Slack package and updated message retrieval to decode text in thread messages. --- packages/slack/src/index.ts | 1 + packages/slack/src/threads.ts | 14 +- .../slack/src/utils/html-entities.test.ts | 152 ++++++++++++++++++ packages/slack/src/utils/html-entities.ts | 134 +++++++++++++++ 4 files changed, 298 insertions(+), 3 deletions(-) create mode 100644 packages/slack/src/utils/html-entities.test.ts create mode 100644 packages/slack/src/utils/html-entities.ts diff --git a/packages/slack/src/index.ts b/packages/slack/src/index.ts index 39ee2ab58..0983b18be 100644 --- a/packages/slack/src/index.ts +++ b/packages/slack/src/index.ts @@ -45,6 +45,7 @@ export * from './utils/validation-helpers'; export * from './utils/message-formatter'; export * from './utils/oauth-helpers'; export { convertMarkdownToSlack } from './utils/markdown-to-slack'; +export { decodeHtmlEntities, decodeSlackMessageText } from './utils/html-entities'; // Reactions export { addReaction, removeReaction, getReactions } from './reactions'; diff --git a/packages/slack/src/threads.ts b/packages/slack/src/threads.ts index 5f11220fc..6d7f2c3d1 100644 --- a/packages/slack/src/threads.ts +++ b/packages/slack/src/threads.ts @@ -1,4 +1,5 @@ import { WebClient } from '@slack/web-api'; +import { decodeSlackMessageText } from './utils/html-entities'; // Define our own simple types to avoid complex Slack API type issues interface SlackBlock { @@ -45,9 +46,12 @@ export async function getThreadMessages({ inclusive: true, // Include the parent message }); - // Cast the result to our SlackMessage type + // Cast the result to our SlackMessage type and decode HTML entities const messages = result.messages || []; - return messages as SlackMessage[]; + return messages.map((message) => ({ + ...message, + text: decodeSlackMessageText(message.text), + })) as SlackMessage[]; } catch (error) { console.error('Failed to get thread messages:', error); throw error; @@ -81,7 +85,11 @@ export async function getMessage({ }); if (result.messages && result.messages.length > 0) { - return result.messages[0] as SlackMessage; + const message = result.messages[0]; + return { + ...message, + text: decodeSlackMessageText(message?.text), + } as SlackMessage; } return null; diff --git a/packages/slack/src/utils/html-entities.test.ts b/packages/slack/src/utils/html-entities.test.ts new file mode 100644 index 000000000..089743488 --- /dev/null +++ b/packages/slack/src/utils/html-entities.test.ts @@ -0,0 +1,152 @@ +import { describe, expect, it } from 'vitest'; +import { decodeHtmlEntities, decodeSlackMessageText } from './html-entities'; + +describe('decodeHtmlEntities', () => { + it('should decode common HTML entities', () => { + expect(decodeHtmlEntities('<')).toBe('<'); + expect(decodeHtmlEntities('>')).toBe('>'); + expect(decodeHtmlEntities('&')).toBe('&'); + expect(decodeHtmlEntities('"')).toBe('"'); + expect(decodeHtmlEntities(''')).toBe("'"); + expect(decodeHtmlEntities(''')).toBe("'"); + }); + + it('should decode multiple entities in a string', () => { + expect(decodeHtmlEntities('<div>Hello & World</div>')).toBe( + '
Hello & World
' + ); + expect(decodeHtmlEntities('"Hello" & 'World'')).toBe( + '"Hello" & \'World\'' + ); + }); + + it('should decode numeric character references', () => { + // Decimal references + expect(decodeHtmlEntities('<')).toBe('<'); + expect(decodeHtmlEntities('>')).toBe('>'); + expect(decodeHtmlEntities('&')).toBe('&'); + expect(decodeHtmlEntities('{')).toBe('{'); + expect(decodeHtmlEntities('}')).toBe('}'); + + // Hexadecimal references + expect(decodeHtmlEntities('<')).toBe('<'); + expect(decodeHtmlEntities('>')).toBe('>'); + expect(decodeHtmlEntities('&')).toBe('&'); + expect(decodeHtmlEntities('{')).toBe('{'); + expect(decodeHtmlEntities('}')).toBe('}'); + + // Case insensitive hex + expect(decodeHtmlEntities('<')).toBe('<'); + expect(decodeHtmlEntities('>')).toBe('>'); + }); + + it('should handle special characters', () => { + expect(decodeHtmlEntities(' ')).toBe(' '); + expect(decodeHtmlEntities('–')).toBe('–'); + expect(decodeHtmlEntities('—')).toBe('—'); + expect(decodeHtmlEntities('…')).toBe('…'); + expect(decodeHtmlEntities('“')).toBe('\u201C'); // Left double quotation mark + expect(decodeHtmlEntities('”')).toBe('\u201D'); // Right double quotation mark + }); + + it('should handle empty or undefined input', () => { + expect(decodeHtmlEntities('')).toBe(''); + expect(decodeHtmlEntities(null as unknown as string)).toBe(null); + expect(decodeHtmlEntities(undefined as unknown as string)).toBe(undefined); + }); + + it('should preserve text without entities', () => { + expect(decodeHtmlEntities('Hello World')).toBe('Hello World'); + expect(decodeHtmlEntities('No entities here!')).toBe('No entities here!'); + }); + + it('should handle repeated entities', () => { + expect(decodeHtmlEntities('&&&')).toBe('&&&'); + expect(decodeHtmlEntities('<<<')).toBe('<<<'); + }); + + it('should decode entities in code examples', () => { + const input = 'if (x < 10 && y > 5) { console.log("Hello"); }'; + const expected = 'if (x < 10 && y > 5) { console.log("Hello"); }'; + expect(decodeHtmlEntities(input)).toBe(expected); + }); +}); + +describe('decodeSlackMessageText', () => { + it('should decode HTML entities while preserving Slack user mentions', () => { + const input = '<@U123456> said "Hello"'; + const expected = '<@U123456> said "Hello"'; + expect(decodeSlackMessageText(input)).toBe(expected); + + // When mention is already properly formatted + const input2 = '<@U123456> said "Hello"'; + const expected2 = '<@U123456> said "Hello"'; + expect(decodeSlackMessageText(input2)).toBe(expected2); + }); + + it('should preserve Slack channel mentions', () => { + const input = 'Check out <#C123456> for more info'; + const expected = 'Check out <#C123456> for more info'; + expect(decodeSlackMessageText(input)).toBe(expected); + + // When channel mention is already properly formatted + const input2 = 'Check out <#C123456> for more info'; + const expected2 = 'Check out <#C123456> for more info'; + expect(decodeSlackMessageText(input2)).toBe(expected2); + }); + + it('should preserve Slack links', () => { + const input = 'Visit <https://example.com|our website> for details'; + const expected = 'Visit for details'; + expect(decodeSlackMessageText(input)).toBe(expected); + + // When link is already properly formatted + const input2 = 'Visit for details'; + const expected2 = 'Visit for details'; + expect(decodeSlackMessageText(input2)).toBe(expected2); + }); + + it('should preserve simple Slack URLs', () => { + const input = 'Check <https://example.com>'; + const expected = 'Check '; + expect(decodeSlackMessageText(input)).toBe(expected); + + // When URL is already properly formatted + const input2 = 'Check '; + const expected2 = 'Check '; + expect(decodeSlackMessageText(input2)).toBe(expected2); + }); + + it('should decode entities in regular text while preserving Slack formatting', () => { + const input = '<@U123456> wrote: <div>Hello & welcome</div> in <#C789012>'; + const expected = '<@U123456> wrote:
Hello & welcome
in <#C789012>'; + expect(decodeSlackMessageText(input)).toBe(expected); + }); + + it('should handle mixed content with code blocks', () => { + const input = + 'Here's the code: if (x < 10 && y > 5) { alert("Hi"); }'; + const expected = 'Here\'s the code: if (x < 10 && y > 5) { alert("Hi"); }'; + expect(decodeSlackMessageText(input)).toBe(expected); + }); + + it('should handle undefined or empty input', () => { + expect(decodeSlackMessageText(undefined)).toBe(undefined); + expect(decodeSlackMessageText('')).toBe(''); + expect(decodeSlackMessageText(' ')).toBe(' '); + }); + + it('should handle complex Slack messages', () => { + const input = + '<@U123456> mentioned <@U789012> in <#C345678>: "Check this <https://example.com|link> for the <code> example"'; + const expected = + '<@U123456> mentioned <@U789012> in <#C345678>: "Check this for the example"'; + expect(decodeSlackMessageText(input)).toBe(expected); + }); + + it('should handle messages with multiple entity types', () => { + const input = 'Testing & more: <script>alert('XSS')</script>'; + const expected = "Testing & more: "; + expect(decodeSlackMessageText(input)).toBe(expected); + }); +}); diff --git a/packages/slack/src/utils/html-entities.ts b/packages/slack/src/utils/html-entities.ts new file mode 100644 index 000000000..1b96c626a --- /dev/null +++ b/packages/slack/src/utils/html-entities.ts @@ -0,0 +1,134 @@ +/** + * Decode HTML entities from Slack messages + * Slack API returns text with HTML entities encoded (e.g., < for <, > for >, & for &) + * This function decodes these entities back to their original characters + */ + +/** + * Map of HTML entities to their decoded characters + * Based on common entities found in Slack messages + */ +const HTML_ENTITIES: Record = { + '<': '<', + '>': '>', + '&': '&', + '"': '"', + ''': "'", + ''': "'", + ''': "'", + '/': '/', + '/': '/', + '`': '`', + '`': '`', + ' ': ' ', + ' ': ' ', + '–': '–', + '—': '—', + '…': '…', + '“': '\u201C', + '”': '\u201D', + '‘': '\u2018', + '’': '\u2019', +}; + +/** + * Decode HTML entities in a string + * @param text - The text containing HTML entities + * @returns The decoded text with HTML entities replaced by their characters + */ +export function decodeHtmlEntities(text: string): string { + if (!text) { + return text; + } + + // Replace known HTML entities + let decodedText = text; + for (const [entity, replacement] of Object.entries(HTML_ENTITIES)) { + // Use global replace to handle multiple occurrences + const regex = new RegExp(escapeRegExp(entity), 'g'); + decodedText = decodedText.replace(regex, replacement); + } + + // Handle numeric character references (e.g., { or {) + // Decimal: { + decodedText = decodedText.replace(/&#(\d+);/g, (_match, code) => { + const charCode = Number.parseInt(code, 10); + return String.fromCharCode(charCode); + }); + + // Hexadecimal: { or { + decodedText = decodedText.replace(/&#[xX]([0-9a-fA-F]+);/g, (_match, code) => { + const charCode = Number.parseInt(code, 16); + return String.fromCharCode(charCode); + }); + + return decodedText; +} + +/** + * Escape special regex characters in a string + * @param string - The string to escape + * @returns The escaped string safe for use in regex + */ +function escapeRegExp(string: string): string { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Decode HTML entities in Slack message text while preserving Slack-specific formatting + * This function is aware of Slack's message format and preserves user/channel mentions + * @param slackText - The Slack message text with potential HTML entities + * @returns The decoded text with HTML entities replaced + */ +export function decodeSlackMessageText(slackText: string | undefined): string | undefined { + if (!slackText) { + return slackText; + } + + // Slack uses <@USERID> for user mentions and <#CHANNELID> for channel mentions + // These should not be decoded as HTML entities, so we need to be careful + // The < and > around these are actual HTML entities that should be decoded + // But the < and > that are already part of mentions should be preserved + + // First, protect Slack mentions by temporarily replacing them + const mentionPlaceholders = new Map(); + let placeholderIndex = 0; + + // Protect user mentions <@USERID> + let protectedText = slackText.replace(/<@[A-Z0-9]+>/g, (match) => { + const placeholder = `__SLACK_USER_MENTION_${placeholderIndex++}__`; + mentionPlaceholders.set(placeholder, match); + return placeholder; + }); + + // Protect channel mentions <#CHANNELID> + protectedText = protectedText.replace(/<#[A-Z0-9]+>/g, (match) => { + const placeholder = `__SLACK_CHANNEL_MENTION_${placeholderIndex++}__`; + mentionPlaceholders.set(placeholder, match); + return placeholder; + }); + + // Protect links + protectedText = protectedText.replace(/<[^>]+\|[^>]+>/g, (match) => { + const placeholder = `__SLACK_LINK_${placeholderIndex++}__`; + mentionPlaceholders.set(placeholder, match); + return placeholder; + }); + + // Protect simple links + protectedText = protectedText.replace(/<(https?:\/\/[^>]+)>/g, (match) => { + const placeholder = `__SLACK_URL_${placeholderIndex++}__`; + mentionPlaceholders.set(placeholder, match); + return placeholder; + }); + + // Now decode HTML entities + let decodedText = decodeHtmlEntities(protectedText); + + // Restore the protected Slack mentions + for (const [placeholder, original] of mentionPlaceholders) { + decodedText = decodedText.replace(placeholder, original); + } + + return decodedText; +}