diff --git a/apps/sim/connectors/zoom/zoom.test.ts b/apps/sim/connectors/zoom/zoom.test.ts
new file mode 100644
index 0000000000..040de394fb
--- /dev/null
+++ b/apps/sim/connectors/zoom/zoom.test.ts
@@ -0,0 +1,87 @@
+/**
+ * @vitest-environment node
+ */
+import { describe, expect, it } from 'vitest'
+import { parseVtt } from '@/connectors/zoom/zoom'
+
+const HEADER = 'WEBVTT\n\n'
+
+describe('parseVtt', () => {
+  it.concurrent('returns empty string for input with no cues', () => {
+    expect(parseVtt(HEADER)).toBe('')
+  })
+
+  it.concurrent('extracts plain spoken text from a single cue', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nHello world\n`
+    expect(parseVtt(vtt)).toBe('Hello world')
+  })
+
+  it.concurrent('preserves WebVTT voice tags as "Speaker: text"', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v Alice>hello there</v>\n`
+    expect(parseVtt(vtt)).toBe('Alice: hello there')
+  })
+
+  it.concurrent('preserves voice tags with class suffix', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v.host Bob>welcome</v>\n`
+    expect(parseVtt(vtt)).toBe('Bob: welcome')
+  })
+
+  it.concurrent('strips inline formatting tags but keeps text', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<b>bold</b> and <i>italic</i>\n`
+    expect(parseVtt(vtt)).toBe('bold and italic')
+  })
+
+  it.concurrent('strips karaoke timestamp tags', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello <00:00:01.000>world\n`
+    expect(parseVtt(vtt)).toBe('hello world')
+  })
+
+  it.concurrent('strips class spans', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<c.loud>SHOUT</c>\n`
+    expect(parseVtt(vtt)).toBe('SHOUT')
+  })
+
+  it.concurrent('skips cue identifier lines before timing', () => {
+    const vtt = `${HEADER}cue-1\n00:00:00.000 --> 00:00:02.000\nhello\n`
+    expect(parseVtt(vtt)).toBe('hello')
+  })
+
+  it.concurrent('joins multiple cues with newlines', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nfirst\n\n00:00:02.000 --> 00:00:04.000\nsecond\n`
+    expect(parseVtt(vtt)).toBe('first\nsecond')
+  })
+
+  it.concurrent('collapses repeated whitespace within a cue', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\nhello   world\n`
+    expect(parseVtt(vtt)).toBe('hello world')
+  })
+
+  it.concurrent('iteratively strips overlapping tags that reconstruct after one pass', () => {
+    const crafted = '<<b>b>injected</<b>b>'
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+    const result = parseVtt(vtt)
+    expect(result).not.toMatch(/<\/?[^>]+>/)
+    expect(result).toContain('injected')
+  })
+
+  it.concurrent('iteratively strips nested script-like tag fragments', () => {
+    const crafted = '<scr<script>ipt>alert(1)</scr</script>ipt>'
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+    const result = parseVtt(vtt)
+    expect(result).not.toMatch(/<\/?[^>]+>/)
+    expect(result.toLowerCase()).not.toContain('script')
+  })
+
+  it.concurrent('sanitizes crafted speaker names that embed tag fragments', () => {
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n<v <b>Evil</b>>payload</v>\n`
+    const result = parseVtt(vtt)
+    expect(result).not.toMatch(/<\/?[^>]+>/)
+  })
+
+  it.concurrent('terminates on adversarial deeply-nested input', () => {
+    const crafted = `${'<'.repeat(50)}b${'>'.repeat(50)}text${'<'.repeat(50)}/b${'>'.repeat(50)}`
+    const vtt = `${HEADER}00:00:00.000 --> 00:00:02.000\n${crafted}\n`
+    const result = parseVtt(vtt)
+    expect(result).not.toMatch(/<\/?[^>]+>/)
+  })
+})
diff --git a/apps/sim/connectors/zoom/zoom.ts b/apps/sim/connectors/zoom/zoom.ts
index 8027754892..240031d1cf 100644
--- a/apps/sim/connectors/zoom/zoom.ts
+++ b/apps/sim/connectors/zoom/zoom.ts
@@ -120,8 +120,10 @@ function findTranscriptFile(files?: ZoomRecordingFile[]): ZoomRecordingFile | un
  * Extracts spoken text from a Zoom WebVTT transcript, stripping cue identifiers,
  * timestamps, and inline markup. Handles both Zoom's `Speaker: text` convention
  * and standard WebVTT `<v Speaker>text</v>` voice tags.
+ *
+ * Exported for unit tests; not part of the connector's public surface.
  */
-function parseVtt(vtt: string): string {
+export function parseVtt(vtt: string): string {
   const lines = vtt.split(/\r?\n/)
   const segments: string[] = []
   let i = 0
@@ -152,10 +154,13 @@ function parseVtt(vtt: string): string {
     if (textParts.length > 0) {
       const raw = textParts.join(' ')
       const withSpeakers = raw.replace(/<v(?:\.[^\s>]+)?\s+([^>]+)>([\s\S]*?)<\/v>/g, '$1: $2')
-      const stripped = withSpeakers
-        .replace(/<\/?[^>]+>/g, '')
-        .replace(/\s+/g, ' ')
-        .trim()
+      let withoutTags = withSpeakers
+      let previous: string
+      do {
+        previous = withoutTags
+        withoutTags = withoutTags.replace(/<\/?[^>]+>/g, '')
+      } while (withoutTags !== previous)
+      const stripped = withoutTags.replace(/\s+/g, ' ').trim()
       if (stripped) segments.push(stripped)
     }
   }