spych.core

  1from faster_whisper import WhisperModel
  2from spych.utils import Notify, Recorder, get_clean_audio_buffer
  3from typing import Union
  4
  5
  6class Spych(Notify):
  7    def __init__(
  8        self,
  9        whisper_model: str = "base.en",
 10        whisper_device: str = "cpu",
 11        whisper_compute_type: str = "int8",
 12        no_speech_threshold: float = 0.3,
 13        vad_speech_threshold: float = 0.5,
 14        vad_silence_threshold: float = 0.35,
 15        vad_silence_frames_threshold: int = 20,
 16        vad_speech_pad_frames: int = 5,
 17        vad_max_speech_duration_s: float = 30.0,
 18    ) -> None:
 19        """
 20        Usage:
 21
 22        - Initializes a transcription object using faster-whisper for fully offline
 23          speech-to-text
 24
 25        Optional:
 26
 27        - `whisper_model`:
 28            - Type: str
 29            - What: The faster-whisper model name to use for transcription
 30            - Default: "base.en"
 31            - Note: Larger models (small, medium, large) provide better accuracy at
 32              the cost of speed; smaller models (tiny, base) are faster but less accurate
 33
 34        - `whisper_device`:
 35            - Type: str
 36            - What: The device to run the whisper model on
 37            - Default: "cpu"
 38            - Options: "cpu", "cuda"
 39            - Note: Use "cuda" for GPU acceleration if available
 40
 41        - `whisper_compute_type`:
 42            - Type: str
 43            - What: The compute type to use for the whisper model
 44            - Default: "int8"
 45            - Options: "int8", "float16", "float32"
 46            - Note: "int8" offers a good balance of speed and accuracy on both CPU and GPU
 47
 48        - `no_speech_threshold`:
 49            - Type: float
 50            - What: The threshold for the `no_speech_prob` returned by faster-whisper
 51            - Default: 0.3
 52            - Note: Segments with a `no_speech_prob` above this threshold will be
 53              ignored to reduce false positives from silence or background noise
 54
 55        - `vad_speech_threshold`:
 56            - Type: float (0.0–1.0)
 57            - What: Silero probability above which a frame is considered speech onset
 58              when `listen` is called with `duration="auto"` or `duration=0`
 59            - Default: 0.5
 60            - Note: Raise to reduce false positives in noisy environments
 61
 62        - `vad_silence_threshold`:
 63            - Type: float (0.0–1.0)
 64            - What: Silero probability below which a frame is considered silence
 65              during an active utterance; must be less than `vad_speech_threshold`
 66              to create a hysteresis band that prevents rapid toggling
 67            - Default: 0.35
 68
 69        - `vad_silence_frames_threshold`:
 70            - Type: int
 71            - What: Consecutive silent frames (~32ms each) required to confirm the
 72              utterance has ended and return the buffer
 73            - Default: 20  (~640ms)
 74
 75        - `vad_speech_pad_frames`:
 76            - Type: int
 77            - What: Pre-roll frames captured before onset confirmation; also the
 78              number of consecutive voiced frames required to confirm speech onset
 79            - Default: 5  (~160ms)
 80
 81        - `vad_max_speech_duration_s`:
 82            - Type: float
 83            - What: Hard cap on a single VAD-captured utterance in seconds; forces
 84              a return even if the speaker never pauses
 85            - Default: 30.0
 86        """
 87        self.wake_model = WhisperModel(
 88            whisper_model,
 89            device=whisper_device,
 90            compute_type=whisper_compute_type,
 91        )
 92        self.no_speech_threshold = no_speech_threshold
 93        self.vad_speech_threshold = vad_speech_threshold
 94        self.vad_silence_threshold = vad_silence_threshold
 95        self.vad_silence_frames_threshold = vad_silence_frames_threshold
 96        self.vad_speech_pad_frames = vad_speech_pad_frames
 97        self.vad_max_speech_duration_s = vad_max_speech_duration_s
 98        self.recorder = Recorder()
 99
100    def listen(
101        self,
102        duration: Union[int, float, str] = 0,
103        device_index: int = -1,
104    ) -> str:
105        """
106        Usage:
107
108        - Records audio from the microphone and returns the transcription as a string.
109          Supports both fixed-duration recording and VAD-gated recording that
110          automatically stops at the end of a natural utterance.
111
112        Optional:
113
114        - `duration`:
115            - Type: int | float | str
116            - What: Controls how long to record
117            - Default: 0
118            - Options:
119                - int | float : Record for exactly this many seconds
120                - "auto" or 0 : Use Silero VAD to detect a complete utterance and
121                                stop automatically when the speaker finishes;
122                                honours the `vad_*` parameters set at init time
123            - Note: "auto" and 0 are equivalent; either signals VAD-gated recording
124
125        - `device_index`:
126            - Type: int
127            - What: The microphone device index to record from
128            - Default: -1
129            - Note: Use `-1` to select the system default input device
130
131        Returns:
132
133        - `transcription`:
134            - Type: str
135            - What: The transcribed text from the recorded audio
136            - Note: Multiple segments are joined with a single space
137        """
138        if duration == "auto" or duration == 0:
139            buffer = self.recorder.record_vad(
140                device_index=device_index,
141                speech_threshold=self.vad_speech_threshold,
142                silence_threshold=self.vad_silence_threshold,
143                silence_frames_threshold=self.vad_silence_frames_threshold,
144                speech_pad_frames=self.vad_speech_pad_frames,
145                max_speech_duration_s=self.vad_max_speech_duration_s,
146            )
147        else:
148            buffer = self.recorder.record(
149                device_index=device_index, duration=duration
150            )
151
152        audio_buffer = get_clean_audio_buffer(buffer)
153        segments, _ = self.wake_model.transcribe(audio_buffer, beam_size=2)
154        return " ".join(
155            [
156                segment.text
157                for segment in segments
158                if segment.no_speech_prob <= self.no_speech_threshold
159            ]
160        )
class Spych(spych.utils.Notify):
  7class Spych(Notify):
  8    def __init__(
  9        self,
 10        whisper_model: str = "base.en",
 11        whisper_device: str = "cpu",
 12        whisper_compute_type: str = "int8",
 13        no_speech_threshold: float = 0.3,
 14        vad_speech_threshold: float = 0.5,
 15        vad_silence_threshold: float = 0.35,
 16        vad_silence_frames_threshold: int = 20,
 17        vad_speech_pad_frames: int = 5,
 18        vad_max_speech_duration_s: float = 30.0,
 19    ) -> None:
 20        """
 21        Usage:
 22
 23        - Initializes a transcription object using faster-whisper for fully offline
 24          speech-to-text
 25
 26        Optional:
 27
 28        - `whisper_model`:
 29            - Type: str
 30            - What: The faster-whisper model name to use for transcription
 31            - Default: "base.en"
 32            - Note: Larger models (small, medium, large) provide better accuracy at
 33              the cost of speed; smaller models (tiny, base) are faster but less accurate
 34
 35        - `whisper_device`:
 36            - Type: str
 37            - What: The device to run the whisper model on
 38            - Default: "cpu"
 39            - Options: "cpu", "cuda"
 40            - Note: Use "cuda" for GPU acceleration if available
 41
 42        - `whisper_compute_type`:
 43            - Type: str
 44            - What: The compute type to use for the whisper model
 45            - Default: "int8"
 46            - Options: "int8", "float16", "float32"
 47            - Note: "int8" offers a good balance of speed and accuracy on both CPU and GPU
 48
 49        - `no_speech_threshold`:
 50            - Type: float
 51            - What: The threshold for the `no_speech_prob` returned by faster-whisper
 52            - Default: 0.3
 53            - Note: Segments with a `no_speech_prob` above this threshold will be
 54              ignored to reduce false positives from silence or background noise
 55
 56        - `vad_speech_threshold`:
 57            - Type: float (0.0–1.0)
 58            - What: Silero probability above which a frame is considered speech onset
 59              when `listen` is called with `duration="auto"` or `duration=0`
 60            - Default: 0.5
 61            - Note: Raise to reduce false positives in noisy environments
 62
 63        - `vad_silence_threshold`:
 64            - Type: float (0.0–1.0)
 65            - What: Silero probability below which a frame is considered silence
 66              during an active utterance; must be less than `vad_speech_threshold`
 67              to create a hysteresis band that prevents rapid toggling
 68            - Default: 0.35
 69
 70        - `vad_silence_frames_threshold`:
 71            - Type: int
 72            - What: Consecutive silent frames (~32ms each) required to confirm the
 73              utterance has ended and return the buffer
 74            - Default: 20  (~640ms)
 75
 76        - `vad_speech_pad_frames`:
 77            - Type: int
 78            - What: Pre-roll frames captured before onset confirmation; also the
 79              number of consecutive voiced frames required to confirm speech onset
 80            - Default: 5  (~160ms)
 81
 82        - `vad_max_speech_duration_s`:
 83            - Type: float
 84            - What: Hard cap on a single VAD-captured utterance in seconds; forces
 85              a return even if the speaker never pauses
 86            - Default: 30.0
 87        """
 88        self.wake_model = WhisperModel(
 89            whisper_model,
 90            device=whisper_device,
 91            compute_type=whisper_compute_type,
 92        )
 93        self.no_speech_threshold = no_speech_threshold
 94        self.vad_speech_threshold = vad_speech_threshold
 95        self.vad_silence_threshold = vad_silence_threshold
 96        self.vad_silence_frames_threshold = vad_silence_frames_threshold
 97        self.vad_speech_pad_frames = vad_speech_pad_frames
 98        self.vad_max_speech_duration_s = vad_max_speech_duration_s
 99        self.recorder = Recorder()
100
101    def listen(
102        self,
103        duration: Union[int, float, str] = 0,
104        device_index: int = -1,
105    ) -> str:
106        """
107        Usage:
108
109        - Records audio from the microphone and returns the transcription as a string.
110          Supports both fixed-duration recording and VAD-gated recording that
111          automatically stops at the end of a natural utterance.
112
113        Optional:
114
115        - `duration`:
116            - Type: int | float | str
117            - What: Controls how long to record
118            - Default: 0
119            - Options:
120                - int | float : Record for exactly this many seconds
121                - "auto" or 0 : Use Silero VAD to detect a complete utterance and
122                                stop automatically when the speaker finishes;
123                                honours the `vad_*` parameters set at init time
124            - Note: "auto" and 0 are equivalent; either signals VAD-gated recording
125
126        - `device_index`:
127            - Type: int
128            - What: The microphone device index to record from
129            - Default: -1
130            - Note: Use `-1` to select the system default input device
131
132        Returns:
133
134        - `transcription`:
135            - Type: str
136            - What: The transcribed text from the recorded audio
137            - Note: Multiple segments are joined with a single space
138        """
139        if duration == "auto" or duration == 0:
140            buffer = self.recorder.record_vad(
141                device_index=device_index,
142                speech_threshold=self.vad_speech_threshold,
143                silence_threshold=self.vad_silence_threshold,
144                silence_frames_threshold=self.vad_silence_frames_threshold,
145                speech_pad_frames=self.vad_speech_pad_frames,
146                max_speech_duration_s=self.vad_max_speech_duration_s,
147            )
148        else:
149            buffer = self.recorder.record(
150                device_index=device_index, duration=duration
151            )
152
153        audio_buffer = get_clean_audio_buffer(buffer)
154        segments, _ = self.wake_model.transcribe(audio_buffer, beam_size=2)
155        return " ".join(
156            [
157                segment.text
158                for segment in segments
159                if segment.no_speech_prob <= self.no_speech_threshold
160            ]
161        )
Spych( whisper_model: str = 'base.en', whisper_device: str = 'cpu', whisper_compute_type: str = 'int8', no_speech_threshold: float = 0.3, vad_speech_threshold: float = 0.5, vad_silence_threshold: float = 0.35, vad_silence_frames_threshold: int = 20, vad_speech_pad_frames: int = 5, vad_max_speech_duration_s: float = 30.0)
 8    def __init__(
 9        self,
10        whisper_model: str = "base.en",
11        whisper_device: str = "cpu",
12        whisper_compute_type: str = "int8",
13        no_speech_threshold: float = 0.3,
14        vad_speech_threshold: float = 0.5,
15        vad_silence_threshold: float = 0.35,
16        vad_silence_frames_threshold: int = 20,
17        vad_speech_pad_frames: int = 5,
18        vad_max_speech_duration_s: float = 30.0,
19    ) -> None:
20        """
21        Usage:
22
23        - Initializes a transcription object using faster-whisper for fully offline
24          speech-to-text
25
26        Optional:
27
28        - `whisper_model`:
29            - Type: str
30            - What: The faster-whisper model name to use for transcription
31            - Default: "base.en"
32            - Note: Larger models (small, medium, large) provide better accuracy at
33              the cost of speed; smaller models (tiny, base) are faster but less accurate
34
35        - `whisper_device`:
36            - Type: str
37            - What: The device to run the whisper model on
38            - Default: "cpu"
39            - Options: "cpu", "cuda"
40            - Note: Use "cuda" for GPU acceleration if available
41
42        - `whisper_compute_type`:
43            - Type: str
44            - What: The compute type to use for the whisper model
45            - Default: "int8"
46            - Options: "int8", "float16", "float32"
47            - Note: "int8" offers a good balance of speed and accuracy on both CPU and GPU
48
49        - `no_speech_threshold`:
50            - Type: float
51            - What: The threshold for the `no_speech_prob` returned by faster-whisper
52            - Default: 0.3
53            - Note: Segments with a `no_speech_prob` above this threshold will be
54              ignored to reduce false positives from silence or background noise
55
56        - `vad_speech_threshold`:
57            - Type: float (0.0–1.0)
58            - What: Silero probability above which a frame is considered speech onset
59              when `listen` is called with `duration="auto"` or `duration=0`
60            - Default: 0.5
61            - Note: Raise to reduce false positives in noisy environments
62
63        - `vad_silence_threshold`:
64            - Type: float (0.0–1.0)
65            - What: Silero probability below which a frame is considered silence
66              during an active utterance; must be less than `vad_speech_threshold`
67              to create a hysteresis band that prevents rapid toggling
68            - Default: 0.35
69
70        - `vad_silence_frames_threshold`:
71            - Type: int
72            - What: Consecutive silent frames (~32ms each) required to confirm the
73              utterance has ended and return the buffer
74            - Default: 20  (~640ms)
75
76        - `vad_speech_pad_frames`:
77            - Type: int
78            - What: Pre-roll frames captured before onset confirmation; also the
79              number of consecutive voiced frames required to confirm speech onset
80            - Default: 5  (~160ms)
81
82        - `vad_max_speech_duration_s`:
83            - Type: float
84            - What: Hard cap on a single VAD-captured utterance in seconds; forces
85              a return even if the speaker never pauses
86            - Default: 30.0
87        """
88        self.wake_model = WhisperModel(
89            whisper_model,
90            device=whisper_device,
91            compute_type=whisper_compute_type,
92        )
93        self.no_speech_threshold = no_speech_threshold
94        self.vad_speech_threshold = vad_speech_threshold
95        self.vad_silence_threshold = vad_silence_threshold
96        self.vad_silence_frames_threshold = vad_silence_frames_threshold
97        self.vad_speech_pad_frames = vad_speech_pad_frames
98        self.vad_max_speech_duration_s = vad_max_speech_duration_s
99        self.recorder = Recorder()

Usage:

  • Initializes a transcription object using faster-whisper for fully offline speech-to-text

Optional:

  • whisper_model:

    • Type: str
    • What: The faster-whisper model name to use for transcription
    • Default: "base.en"
    • Note: Larger models (small, medium, large) provide better accuracy at the cost of speed; smaller models (tiny, base) are faster but less accurate
  • whisper_device:

    • Type: str
    • What: The device to run the whisper model on
    • Default: "cpu"
    • Options: "cpu", "cuda"
    • Note: Use "cuda" for GPU acceleration if available
  • whisper_compute_type:

    • Type: str
    • What: The compute type to use for the whisper model
    • Default: "int8"
    • Options: "int8", "float16", "float32"
    • Note: "int8" offers a good balance of speed and accuracy on both CPU and GPU
  • no_speech_threshold:

    • Type: float
    • What: The threshold for the no_speech_prob returned by faster-whisper
    • Default: 0.3
    • Note: Segments with a no_speech_prob above this threshold will be ignored to reduce false positives from silence or background noise
  • vad_speech_threshold:

    • Type: float (0.0–1.0)
    • What: Silero probability above which a frame is considered speech onset when listen is called with duration="auto" or duration=0
    • Default: 0.5
    • Note: Raise to reduce false positives in noisy environments
  • vad_silence_threshold:

    • Type: float (0.0–1.0)
    • What: Silero probability below which a frame is considered silence during an active utterance; must be less than vad_speech_threshold to create a hysteresis band that prevents rapid toggling
    • Default: 0.35
  • vad_silence_frames_threshold:

    • Type: int
    • What: Consecutive silent frames (~32ms each) required to confirm the utterance has ended and return the buffer
    • Default: 20 (~640ms)
  • vad_speech_pad_frames:

    • Type: int
    • What: Pre-roll frames captured before onset confirmation; also the number of consecutive voiced frames required to confirm speech onset
    • Default: 5 (~160ms)
  • vad_max_speech_duration_s:

    • Type: float
    • What: Hard cap on a single VAD-captured utterance in seconds; forces a return even if the speaker never pauses
    • Default: 30.0
wake_model
no_speech_threshold
vad_speech_threshold
vad_silence_threshold
vad_silence_frames_threshold
vad_speech_pad_frames
vad_max_speech_duration_s
recorder
def listen(self, duration: int | float | str = 0, device_index: int = -1) -> str:
101    def listen(
102        self,
103        duration: Union[int, float, str] = 0,
104        device_index: int = -1,
105    ) -> str:
106        """
107        Usage:
108
109        - Records audio from the microphone and returns the transcription as a string.
110          Supports both fixed-duration recording and VAD-gated recording that
111          automatically stops at the end of a natural utterance.
112
113        Optional:
114
115        - `duration`:
116            - Type: int | float | str
117            - What: Controls how long to record
118            - Default: 0
119            - Options:
120                - int | float : Record for exactly this many seconds
121                - "auto" or 0 : Use Silero VAD to detect a complete utterance and
122                                stop automatically when the speaker finishes;
123                                honours the `vad_*` parameters set at init time
124            - Note: "auto" and 0 are equivalent; either signals VAD-gated recording
125
126        - `device_index`:
127            - Type: int
128            - What: The microphone device index to record from
129            - Default: -1
130            - Note: Use `-1` to select the system default input device
131
132        Returns:
133
134        - `transcription`:
135            - Type: str
136            - What: The transcribed text from the recorded audio
137            - Note: Multiple segments are joined with a single space
138        """
139        if duration == "auto" or duration == 0:
140            buffer = self.recorder.record_vad(
141                device_index=device_index,
142                speech_threshold=self.vad_speech_threshold,
143                silence_threshold=self.vad_silence_threshold,
144                silence_frames_threshold=self.vad_silence_frames_threshold,
145                speech_pad_frames=self.vad_speech_pad_frames,
146                max_speech_duration_s=self.vad_max_speech_duration_s,
147            )
148        else:
149            buffer = self.recorder.record(
150                device_index=device_index, duration=duration
151            )
152
153        audio_buffer = get_clean_audio_buffer(buffer)
154        segments, _ = self.wake_model.transcribe(audio_buffer, beam_size=2)
155        return " ".join(
156            [
157                segment.text
158                for segment in segments
159                if segment.no_speech_prob <= self.no_speech_threshold
160            ]
161        )

Usage:

  • Records audio from the microphone and returns the transcription as a string. Supports both fixed-duration recording and VAD-gated recording that automatically stops at the end of a natural utterance.

Optional:

  • duration:

    • Type: int | float | str
    • What: Controls how long to record
    • Default: 0
    • Options:
      • int | float : Record for exactly this many seconds
      • "auto" or 0 : Use Silero VAD to detect a complete utterance and stop automatically when the speaker finishes; honours the vad_* parameters set at init time
    • Note: "auto" and 0 are equivalent; either signals VAD-gated recording
  • device_index:

    • Type: int
    • What: The microphone device index to record from
    • Default: -1
    • Note: Use -1 to select the system default input device

Returns:

  • transcription:
    • Type: str
    • What: The transcribed text from the recorded audio
    • Note: Multiple segments are joined with a single space
Inherited Members
spych.utils.Notify
notify