diff --git a/apps/sim/connectors/zoom/zoom.test.ts b/apps/sim/connectors/zoom/zoom.test.ts
new file mode 100644
index 0000000000..040de394fb
--- /dev/null
+++ b/apps/sim/connectors/zoom/zoom.test.ts
@@ -0,0 +1,87 @@
+/**
+ * @vitest-environment node
+ */
+import { describe, expect, it } from 'vitest'
+import { parseVtt } from '@/connectors/zoom/zoom'
+
+const HEADER = 'WEBVTT\n\n'
+
+describe('parseVtt', () => {
+ it.concurrent('returns empty string for input with no cues', () => {
+ expect(parseVtt(HEADER)).toBe('')
+ })
+
+ it.concurrent('extracts plain spoken text from a single cue', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nHello world\n`
+ expect(parseVtt(vtt)).toBe('Hello world')
+ })
+
+ it.concurrent('preserves WebVTT voice tags as "Speaker: text"', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello there\n`
+ expect(parseVtt(vtt)).toBe('Alice: hello there')
+ })
+
+ it.concurrent('preserves voice tags with class suffix', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nwelcome\n`
+ expect(parseVtt(vtt)).toBe('Bob: welcome')
+ })
+
+ it.concurrent('strips inline formatting tags but keeps text', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nbold and italic\n`
+ expect(parseVtt(vtt)).toBe('bold and italic')
+ })
+
+ it.concurrent('strips karaoke timestamp tags', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello <00:00:01.000>world\n`
+ expect(parseVtt(vtt)).toBe('hello world')
+ })
+
+ it.concurrent('strips class spans', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nSHOUT\n`
+ expect(parseVtt(vtt)).toBe('SHOUT')
+ })
+
+ it.concurrent('skips cue identifier lines before timing', () => {
+ const vtt = `${HEADER}cue-1\n00:00:00.000 --> 00:00:02.000\nhello\n`
+ expect(parseVtt(vtt)).toBe('hello')
+ })
+
+ it.concurrent('joins multiple cues with newlines', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nfirst\n\n00:00:02.000 --> 00:00:04.000\nsecond\n`
+ expect(parseVtt(vtt)).toBe('first\nsecond')
+ })
+
+ it.concurrent('collapses repeated whitespace within a cue', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello world\n`
+ expect(parseVtt(vtt)).toBe('hello world')
+ })
+
+ it.concurrent('iteratively strips overlapping tags that reconstruct after one pass', () => {
+ const crafted = '<b>injectedb>'
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+ const result = parseVtt(vtt)
+ expect(result).not.toMatch(/<\/?[^>]+>/)
+ expect(result).toContain('injected')
+ })
+
+ it.concurrent('iteratively strips nested script-like tag fragments', () => {
+ const crafted = 'ipt>alert(1)ipt>'
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+ const result = parseVtt(vtt)
+ expect(result).not.toMatch(/<\/?[^>]+>/)
+ expect(result.toLowerCase()).not.toContain('script')
+ })
+
+ it.concurrent('sanitizes crafted speaker names that embed tag fragments', () => {
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nEvil>payload\n`
+ const result = parseVtt(vtt)
+ expect(result).not.toMatch(/<\/?[^>]+>/)
+ })
+
+ it.concurrent('terminates on adversarial deeply-nested input', () => {
+ const crafted = `${'<'.repeat(50)}b${'>'.repeat(50)}text${'<'.repeat(50)}/b${'>'.repeat(50)}`
+ const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+ const result = parseVtt(vtt)
+ expect(result).not.toMatch(/<\/?[^>]+>/)
+ })
+})
diff --git a/apps/sim/connectors/zoom/zoom.ts b/apps/sim/connectors/zoom/zoom.ts
index 8027754892..240031d1cf 100644
--- a/apps/sim/connectors/zoom/zoom.ts
+++ b/apps/sim/connectors/zoom/zoom.ts
@@ -120,8 +120,10 @@ function findTranscriptFile(files?: ZoomRecordingFile[]): ZoomRecordingFile | un
* Extracts spoken text from a Zoom WebVTT transcript, stripping cue identifiers,
* timestamps, and inline markup. Handles both Zoom's `Speaker: text` convention
* and standard WebVTT `<v Speaker>text</v>` voice tags.
+ *
+ * Exported for unit tests; not part of the connector's public surface.
*/
-function parseVtt(vtt: string): string {
+export function parseVtt(vtt: string): string {
const lines = vtt.split(/\r?\n/)
const segments: string[] = []
let i = 0
@@ -152,10 +154,13 @@ function parseVtt(vtt: string): string {
if (textParts.length > 0) {
const raw = textParts.join(' ')
const withSpeakers = raw.replace(/]+)?\s+([^>]+)>([\s\S]*?)<\/v>/g, '$1: $2')
- const stripped = withSpeakers
- .replace(/<\/?[^>]+>/g, '')
- .replace(/\s+/g, ' ')
- .trim()
+ let withoutTags = withSpeakers
+ let previous: string
+ do {
+ previous = withoutTags
+ withoutTags = withoutTags.replace(/<\/?[^>]+>/g, '')
+ } while (withoutTags !== previous)
+ const stripped = withoutTags.replace(/\s+/g, ' ').trim()
if (stripped) segments.push(stripped)
}
}