feat: add Turn class and implement turns property in Transcript model for aggregating utterances

2025-12-22 17:46:54 +01:00
parent 5c6d87dab7
commit e425733368
1 changed files with 44 additions and 0 deletions
--- a/src/helia/models/transcript.py
+++ b/src/helia/models/transcript.py
@@ -12,10 +12,54 @@ class Utterance(BaseModel):
    value: str


+class Turn(BaseModel):
+    """A run of consecutive utterances by one speaker, merged into one unit.
+
+    ``Transcript.turns`` builds these via ``Transcript._create_turn``; the
+    field notes below describe how that helper fills each field.
+    """
+
+    # Speaker shared by every utterance in the run (taken from the first one).
+    speaker: Literal["Interviewer", "Participant"]
+    # The utterances' ``value`` strings joined with single spaces.
+    value: str
+    # ``start_time`` of the first utterance in the run.
+    start_time: float
+    # ``end_time`` of the last utterance in the run.
+    end_time: float
+    # Number of utterances that were merged into this turn.
+    utterance_count: int
+
+
 class Transcript(Document):
    transcript_id: str
    utterances: list[Utterance]

+    @property
+    def turns(self) -> list[Turn]:
+        """Merge each run of same-speaker utterances into one Turn.
+
+        Walks ``utterances`` in order and returns the turns in that same
+        order, each built by ``_create_turn``. A falsy ``utterances``
+        yields an empty list.
+        """
+        if not self.utterances:
+            return []
+
+        merged: list[Turn] = []
+        run: list[Utterance] = []
+
+        for item in self.utterances:
+            # A change of speaker closes the run gathered so far.
+            if run and item.speaker != run[-1].speaker:
+                merged.append(self._create_turn(run))
+                run = []
+            run.append(item)
+
+        # The last run never meets a speaker change, so flush it here.
+        if run:
+            merged.append(self._create_turn(run))
+
+        return merged
+
+    def _create_turn(self, batch: list[Utterance]) -> Turn:
+        """Build the Turn for one run of utterances.
+
+        ``batch`` must be non-empty (indexing an empty list raises
+        ``IndexError``). Speaker and start time come from the first
+        utterance, end time from the last, and the text is every
+        utterance's ``value`` joined with single spaces.
+        """
+        first, last = batch[0], batch[-1]
+        text = " ".join(piece.value for piece in batch)
+        return Turn(
+            speaker=first.speaker,
+            value=text,
+            start_time=first.start_time,
+            end_time=last.end_time,
+            utterance_count=len(batch),
+        )
+
    class Settings:
        name = "transcripts"
        indexes: ClassVar = [