Compare commits
No commits in common. "282208d53701ec09b5bc8acf3169a1fd3b8931ff" and "c0a1757cab2ed794e2a611ccc9d0e5dd12d9030c" have entirely different histories.
282208d537
...
c0a1757cab
15 changed files with 15 additions and 4184 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -4,4 +4,3 @@
|
|||
modules/
|
||||
**/result
|
||||
*.qcow
|
||||
**/target
|
||||
|
|
|
|||
|
|
@ -149,9 +149,6 @@ in
|
|||
kdePackages.plasma-browser-integration
|
||||
# kdePackages.plasma-workspace-wallpapers
|
||||
|
||||
# On-screen keyboard (Plasma Wayland)
|
||||
kdePackages.plasma-keyboard
|
||||
|
||||
# Panel applets required for widgets
|
||||
kdePackages.plasma-nm # org.kde.plasma.networkmanagement
|
||||
kdePackages.bluedevil # org.kde.plasma.bluetooth
|
||||
|
|
|
|||
|
|
@ -1,108 +0,0 @@
|
|||
# stt_ime - Speech-to-Text Input Method for Fcitx5
|
||||
|
||||
Local, privacy-preserving speech-to-text that integrates as a native Fcitx5 input method.
|
||||
|
||||
## Components
|
||||
|
||||
- **stt-stream**: Rust CLI that captures audio, runs VAD, and transcribes with Whisper
|
||||
- **fcitx5-stt**: C++ Fcitx5 addon that spawns stt-stream and commits text to apps
|
||||
|
||||
## Modes
|
||||
|
||||
- **Manual**: Press `Ctrl+Space` or `Ctrl+R` to start/stop recording
|
||||
- **Oneshot**: Automatically starts on speech, commits on silence, then resets
|
||||
- **Continuous**: Always listening, commits each utterance automatically
|
||||
|
||||
Press `Ctrl+M` while STT is active to cycle between modes.
|
||||
|
||||
## Keys (when STT input method is active)
|
||||
|
||||
| Key | Action |
|
||||
|-----|--------|
|
||||
| `Ctrl+Space` / `Ctrl+R` | Toggle recording (manual mode) |
|
||||
| `Ctrl+M` | Cycle mode (manual → oneshot → continuous) |
|
||||
| `Enter` | Accept current preedit text |
|
||||
| `Escape` | Cancel recording / clear preedit |
|
||||
|
||||
## Usage
|
||||
|
||||
### NixOS Module
|
||||
|
||||
```nix
|
||||
# In your host's flake.nix inputs:
|
||||
stt_ime.url = "git+https://git.ros.one/josh/nixos-config?dir=flakes/stt_ime";
|
||||
|
||||
# In your NixOS config:
|
||||
{
|
||||
imports = [ inputs.stt_ime.nixosModules.default ];
|
||||
|
||||
ringofstorms.sttIme = {
|
||||
enable = true;
|
||||
model = "base.en"; # tiny, base, small, medium (each also available as an English-only ".en" variant), or large-v3 (multilingual only)
|
||||
useGpu = false; # set true for CUDA acceleration
|
||||
};
|
||||
}
|
||||
```
|
||||
|
||||
### Standalone CLI
|
||||
|
||||
```bash
|
||||
# Run with default settings (manual mode)
|
||||
stt-stream
|
||||
|
||||
# Run in continuous mode
|
||||
stt-stream --mode continuous
|
||||
|
||||
# Use a specific model
|
||||
stt-stream --model small-en
|
||||
|
||||
# Commands are read line by line from stdin (manual mode); each example below pipes a single command into a newly started process:
|
||||
echo "start" | stt-stream # begin recording
|
||||
echo "stop" | stt-stream # stop and transcribe
|
||||
echo "cancel" | stt-stream # cancel without transcribing
|
||||
echo "shutdown" | stt-stream # exit
|
||||
```
|
||||
|
||||
### Output Format (NDJSON)
|
||||
|
||||
```json
|
||||
{"type":"ready"}
|
||||
{"type":"recording_started"}
|
||||
{"type":"partial","text":"hello worl"}
|
||||
{"type":"partial","text":"hello world"}
|
||||
{"type":"final","text":"Hello world."}
|
||||
{"type":"recording_stopped"}
|
||||
{"type":"shutdown"}
|
||||
```
|
||||
|
||||
## Models
|
||||
|
||||
Models are automatically downloaded from Hugging Face on first run and cached in `~/.cache/stt-stream/models/`.
|
||||
|
||||
| Model | Size | Speed | Quality |
|
||||
|-------|------|-------|---------|
|
||||
| tiny.en | ~75MB | Fastest | Basic |
|
||||
| base.en | ~150MB | Fast | Good (default) |
|
||||
| small.en | ~500MB | Medium | Better |
|
||||
| medium.en | ~1.5GB | Slow | Great |
|
||||
| large-v3 | ~3GB | Slowest | Best (multilingual) |
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `STT_STREAM_MODEL_PATH`: Path to a specific model file
|
||||
- `STT_STREAM_MODEL`: Model name (overridden by CLI)
|
||||
- `STT_STREAM_USE_GPU`: Set to "1" for GPU acceleration
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
cd flakes/stt_ime
|
||||
nix build .#stt-stream # Rust CLI only
|
||||
nix build .#fcitx5-stt # Fcitx5 addon (includes stt-stream)
|
||||
nix build # Default: fcitx5-stt
|
||||
```
|
||||
|
||||
## Integration with de_plasma
|
||||
|
||||
The addon is automatically added to Fcitx5 when `ringofstorms.sttIme.enable = true`.
|
||||
It appears as "Speech to Text" (STT) in the input method switcher alongside US and Mozc.
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
# Build configuration for the fcitx5-stt addon: a single MODULE library
# (stt.so) plus the Fcitx5 addon / input-method description files.
cmake_minimum_required(VERSION 3.16)
project(fcitx5-stt VERSION 0.1.0 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Provides CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_DATADIR so the install rules
# below follow the platform layout (lib vs. lib64 vs. multiarch) instead of
# hard-coding "lib" and "share".
include(GNUInstallDirs)

# Find Fcitx5
find_package(Fcitx5Core REQUIRED)
find_package(Fcitx5Utils REQUIRED)

# Path to stt-stream binary (set by Nix via -DSTT_STREAM_PATH=...).
# Falls back to the bare command name, which is resolved through PATH at
# runtime.
if(NOT DEFINED STT_STREAM_PATH)
  set(STT_STREAM_PATH "stt-stream")
endif()

# Configure header with path. @ONLY restricts substitution to @VAR@
# references, so any literal ${...} text in the template is left untouched.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in"
  "${CMAKE_CURRENT_BINARY_DIR}/config.h"
  @ONLY
)

# Build the addon shared library
add_library(stt MODULE
  src/stt.cpp
)

# The generated config.h lives in the binary directory.
target_include_directories(stt PRIVATE
  "${CMAKE_CURRENT_BINARY_DIR}"
)

target_link_libraries(stt PRIVATE
  Fcitx5::Core
  Fcitx5::Utils
)

# Set output name without "lib" prefix
set_target_properties(stt PROPERTIES PREFIX "")

# Install the addon library and its description files relative to the
# standard GNU install directories.
install(TARGETS stt DESTINATION "${CMAKE_INSTALL_LIBDIR}/fcitx5")
install(FILES data/stt.conf DESTINATION "${CMAKE_INSTALL_DATADIR}/fcitx5/addon")
install(FILES data/stt-im.conf DESTINATION "${CMAKE_INSTALL_DATADIR}/fcitx5/inputmethod")
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
[InputMethod]
|
||||
Name=Speech to Text
|
||||
Icon=audio-input-microphone
|
||||
Label=STT
|
||||
LangCode=
|
||||
Addon=stt
|
||||
Configurable=False
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
[Addon]
|
||||
Name=stt
|
||||
Category=InputMethod
|
||||
Library=stt
|
||||
Type=SharedLibrary
|
||||
OnDemand=True
|
||||
Configurable=False
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
// Path to stt-stream binary
|
||||
#define STT_STREAM_PATH "@STT_STREAM_PATH@"
|
||||
|
|
@ -1,533 +0,0 @@
|
|||
/*
|
||||
* fcitx5-stt: Speech-to-Text Input Method Engine for Fcitx5
|
||||
*
|
||||
* This is a thin shim that spawns the stt-stream Rust binary and
|
||||
* bridges its JSON events to Fcitx5's input method API.
|
||||
*
|
||||
* Modes:
|
||||
* - Oneshot: Record until silence, commit, reset
|
||||
* - Continuous: Always listen, commit on silence
|
||||
* - Manual: Start/stop via hotkey
|
||||
*
|
||||
* UX:
|
||||
* - Partial text shown as preedit (underlined)
|
||||
* - Final text committed on stop/silence
|
||||
* - Escape cancels without committing
|
||||
* - Enter accepts current preedit
|
||||
*/
|
||||
|
||||
#include <fcitx/addonfactory.h>
#include <fcitx/addonmanager.h>
#include <fcitx/inputcontext.h>
#include <fcitx/inputcontextmanager.h>
#include <fcitx/inputmethodengine.h>
#include <fcitx/inputpanel.h>
#include <fcitx/instance.h>
#include <fcitx-utils/event.h>
#include <fcitx-utils/i18n.h>
#include <fcitx-utils/log.h>
#include <fcitx-utils/utf8.h>

#include <cerrno>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>

#include <fcntl.h>
#include <signal.h>
#include <sys/wait.h>
#include <unistd.h>

#include "config.h"
|
||||
|
||||
namespace {
|
||||
|
||||
FCITX_DEFINE_LOG_CATEGORY(stt_log, "stt");
|
||||
#define STT_DEBUG() FCITX_LOGC(stt_log, Debug)
|
||||
#define STT_INFO() FCITX_LOGC(stt_log, Info)
|
||||
#define STT_WARN() FCITX_LOGC(stt_log, Warn)
|
||||
#define STT_ERROR() FCITX_LOGC(stt_log, Error)
|
||||
|
||||
// Operating modes
|
||||
enum class SttMode {
|
||||
Oneshot,
|
||||
Continuous,
|
||||
Manual
|
||||
};
|
||||
|
||||
// Simple JSON parsing (we only need a few fields)
|
||||
// One NDJSON event line emitted by stt-stream.
//
// Only the string fields "type", "text" and "message" are extracted; a field
// that is absent or malformed is left empty. This is a deliberately small
// scanner, not a general JSON parser: it looks for `"key":"` and decodes the
// string value that follows, including JSON escape sequences.
struct JsonEvent {
    std::string type;
    std::string text;
    std::string message;

    // Extracts the known fields from a single JSON object line.
    static JsonEvent parse(const std::string& line) {
        JsonEvent ev;
        ev.type = findString(line, "type");
        ev.text = findString(line, "text");
        ev.message = findString(line, "message");
        return ev;
    }

private:
    // Reads four hex digits starting at `pos`; returns false if they are
    // missing or not hexadecimal.
    static bool readHex4(const std::string& s, std::string::size_type pos,
                         unsigned& out) {
        if (pos + 4 > s.size()) return false;
        unsigned value = 0;
        for (std::string::size_type i = 0; i < 4; ++i) {
            const char c = s[pos + i];
            value <<= 4;
            if (c >= '0' && c <= '9') {
                value |= static_cast<unsigned>(c - '0');
            } else if (c >= 'a' && c <= 'f') {
                value |= static_cast<unsigned>(c - 'a' + 10);
            } else if (c >= 'A' && c <= 'F') {
                value |= static_cast<unsigned>(c - 'A' + 10);
            } else {
                return false;
            }
        }
        out = value;
        return true;
    }

    // Appends the UTF-8 encoding of code point `cp` to `out`.
    static void appendUtf8(std::string& out, unsigned cp) {
        if (cp < 0x80) {
            out += static_cast<char>(cp);
        } else if (cp < 0x800) {
            out += static_cast<char>(0xC0 | (cp >> 6));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        } else if (cp < 0x10000) {
            out += static_cast<char>(0xE0 | (cp >> 12));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        } else {
            out += static_cast<char>(0xF0 | (cp >> 18));
            out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
    }

    // Returns the decoded string value of `key`, or "" when the key is
    // missing, the value is unterminated, or a \u escape is malformed.
    static std::string findString(const std::string& line,
                                  const std::string& key) {
        const std::string search = "\"" + key + "\":\"";
        auto pos = line.find(search);
        if (pos == std::string::npos) return "";
        pos += search.length();

        std::string value;
        while (pos < line.size()) {
            const char c = line[pos++];
            if (c == '"') {
                return value;  // unescaped quote terminates the string
            }
            if (c != '\\') {
                value += c;
                continue;
            }
            if (pos >= line.size()) break;  // dangling backslash
            const char esc = line[pos++];
            switch (esc) {
            case 'n': value += '\n'; break;
            case 't': value += '\t'; break;
            case 'r': value += '\r'; break;
            case 'b': value += '\b'; break;
            case 'f': value += '\f'; break;
            case 'u': {
                unsigned cp = 0;
                if (!readHex4(line, pos, cp)) return "";
                pos += 4;
                if (cp >= 0xD800 && cp <= 0xDBFF) {
                    // High surrogate: combine with a following \uDC00-\uDFFF.
                    unsigned low = 0;
                    if (pos + 1 < line.size() && line[pos] == '\\' &&
                        line[pos + 1] == 'u' && readHex4(line, pos + 2, low) &&
                        low >= 0xDC00 && low <= 0xDFFF) {
                        cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                        pos += 6;
                    } else {
                        cp = 0xFFFD;  // unpaired surrogate
                    }
                } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
                    cp = 0xFFFD;  // lone low surrogate
                }
                appendUtf8(value, cp);
                break;
            }
            default:
                // Covers \" \\ and \/ (and passes unknown escapes through).
                value += esc;
                break;
            }
        }
        return "";  // no closing quote found
    }
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
class SttEngine;
|
||||
|
||||
class SttState : public fcitx::InputContextProperty {
|
||||
public:
|
||||
SttState(SttEngine* engine, fcitx::InputContext* ic)
|
||||
: engine_(engine), ic_(ic) {}
|
||||
|
||||
void setPreedit(const std::string& text);
|
||||
void commit(const std::string& text);
|
||||
void clear();
|
||||
|
||||
bool isRecording() const { return recording_; }
|
||||
void setRecording(bool r) { recording_ = r; }
|
||||
|
||||
const std::string& preeditText() const { return preedit_; }
|
||||
|
||||
private:
|
||||
SttEngine* engine_;
|
||||
fcitx::InputContext* ic_;
|
||||
std::string preedit_;
|
||||
bool recording_ = false;
|
||||
};
|
||||
|
||||
class SttEngine : public fcitx::InputMethodEngineV2 {
|
||||
public:
|
||||
SttEngine(fcitx::Instance* instance);
|
||||
~SttEngine() override;
|
||||
|
||||
// InputMethodEngine interface
|
||||
void activate(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) override;
|
||||
void deactivate(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) override;
|
||||
void keyEvent(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::KeyEvent& keyEvent) override;
|
||||
void reset(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) override;
|
||||
|
||||
// List input methods this engine provides
|
||||
std::vector<fcitx::InputMethodEntry> listInputMethods() override {
|
||||
std::vector<fcitx::InputMethodEntry> result;
|
||||
result.emplace_back(
|
||||
"stt", // unique name
|
||||
_("Speech to Text"), // display name
|
||||
"*", // language (any)
|
||||
"stt" // addon name
|
||||
);
|
||||
return result;
|
||||
}
|
||||
|
||||
fcitx::Instance* instance() { return instance_; }
|
||||
|
||||
// Process management
|
||||
void startProcess();
|
||||
void stopProcess();
|
||||
void sendCommand(const std::string& cmd);
|
||||
|
||||
// Mode
|
||||
SttMode mode() const { return mode_; }
|
||||
void setMode(SttMode m);
|
||||
void cycleMode();
|
||||
|
||||
private:
|
||||
void onProcessOutput();
|
||||
void handleEvent(const JsonEvent& ev);
|
||||
|
||||
fcitx::Instance* instance_;
|
||||
fcitx::FactoryFor<SttState> factory_;
|
||||
|
||||
// Process state
|
||||
pid_t childPid_ = -1;
|
||||
int stdinFd_ = -1;
|
||||
int stdoutFd_ = -1;
|
||||
std::unique_ptr<fcitx::EventSourceIO> ioEvent_;
|
||||
std::string readBuffer_;
|
||||
|
||||
// Mode
|
||||
SttMode mode_ = SttMode::Manual;
|
||||
|
||||
// Current state
|
||||
bool ready_ = false;
|
||||
fcitx::InputContext* activeIc_ = nullptr;
|
||||
};
|
||||
|
||||
// SttState implementation
|
||||
void SttState::setPreedit(const std::string& text) {
|
||||
preedit_ = text;
|
||||
if (ic_->hasFocus()) {
|
||||
fcitx::Text preeditText;
|
||||
preeditText.append(text, fcitx::TextFormatFlag::Underline);
|
||||
preeditText.setCursor(text.length());
|
||||
ic_->inputPanel().setClientPreedit(preeditText);
|
||||
ic_->updatePreedit();
|
||||
}
|
||||
}
|
||||
|
||||
void SttState::commit(const std::string& text) {
|
||||
if (!text.empty() && ic_->hasFocus()) {
|
||||
ic_->commitString(text);
|
||||
}
|
||||
clear();
|
||||
}
|
||||
|
||||
void SttState::clear() {
|
||||
preedit_.clear();
|
||||
if (ic_->hasFocus()) {
|
||||
ic_->inputPanel().reset();
|
||||
ic_->updatePreedit();
|
||||
ic_->updateUserInterface(fcitx::UserInterfaceComponent::InputPanel);
|
||||
}
|
||||
}
|
||||
|
||||
// SttEngine implementation
|
||||
SttEngine::SttEngine(fcitx::Instance* instance)
|
||||
: instance_(instance),
|
||||
factory_([this](fcitx::InputContext& ic) {
|
||||
return new SttState(this, &ic);
|
||||
}) {
|
||||
instance_->inputContextManager().registerProperty("sttState", &factory_);
|
||||
STT_INFO() << "SttEngine initialized";
|
||||
}
|
||||
|
||||
SttEngine::~SttEngine() {
|
||||
stopProcess();
|
||||
}
|
||||
|
||||
void SttEngine::activate(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) {
|
||||
FCITX_UNUSED(entry);
|
||||
auto* ic = event.inputContext();
|
||||
activeIc_ = ic;
|
||||
|
||||
STT_INFO() << "STT activated";
|
||||
|
||||
// Start the backend process if not running
|
||||
if (childPid_ < 0) {
|
||||
startProcess();
|
||||
}
|
||||
|
||||
// In continuous mode, start recording automatically
|
||||
if (mode_ == SttMode::Continuous && ready_) {
|
||||
sendCommand("start");
|
||||
auto* state = ic->propertyFor(&factory_);
|
||||
state->setRecording(true);
|
||||
}
|
||||
}
|
||||
|
||||
void SttEngine::deactivate(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) {
|
||||
FCITX_UNUSED(entry);
|
||||
auto* ic = event.inputContext();
|
||||
auto* state = ic->propertyFor(&factory_);
|
||||
|
||||
// Stop recording if active
|
||||
if (state->isRecording()) {
|
||||
sendCommand("cancel");
|
||||
state->setRecording(false);
|
||||
}
|
||||
state->clear();
|
||||
|
||||
activeIc_ = nullptr;
|
||||
STT_INFO() << "STT deactivated";
|
||||
}
|
||||
|
||||
void SttEngine::keyEvent(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::KeyEvent& keyEvent) {
|
||||
FCITX_UNUSED(entry);
|
||||
auto* ic = keyEvent.inputContext();
|
||||
auto* state = ic->propertyFor(&factory_);
|
||||
|
||||
// Handle special keys
|
||||
if (keyEvent.isRelease()) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto key = keyEvent.key();
|
||||
|
||||
// Escape: cancel recording/preedit
|
||||
if (key.check(FcitxKey_Escape)) {
|
||||
if (state->isRecording() || !state->preeditText().empty()) {
|
||||
sendCommand("cancel");
|
||||
state->setRecording(false);
|
||||
state->clear();
|
||||
keyEvent.filterAndAccept();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Enter/Return: accept preedit
|
||||
if (key.check(FcitxKey_Return) || key.check(FcitxKey_KP_Enter)) {
|
||||
if (!state->preeditText().empty()) {
|
||||
state->commit(state->preeditText());
|
||||
sendCommand("cancel");
|
||||
state->setRecording(false);
|
||||
keyEvent.filterAndAccept();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Ctrl+Space or Ctrl+R: toggle recording (in manual mode)
|
||||
if (mode_ == SttMode::Manual) {
|
||||
if (key.check(FcitxKey_space, fcitx::KeyState::Ctrl) ||
|
||||
key.check(FcitxKey_r, fcitx::KeyState::Ctrl)) {
|
||||
if (state->isRecording()) {
|
||||
sendCommand("stop");
|
||||
state->setRecording(false);
|
||||
} else {
|
||||
state->clear();
|
||||
sendCommand("start");
|
||||
state->setRecording(true);
|
||||
}
|
||||
keyEvent.filterAndAccept();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Ctrl+M: cycle mode
|
||||
if (key.check(FcitxKey_m, fcitx::KeyState::Ctrl)) {
|
||||
cycleMode();
|
||||
keyEvent.filterAndAccept();
|
||||
return;
|
||||
}
|
||||
|
||||
// In recording state, absorb most keys
|
||||
if (state->isRecording()) {
|
||||
keyEvent.filterAndAccept();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void SttEngine::reset(const fcitx::InputMethodEntry& entry,
|
||||
fcitx::InputContextEvent& event) {
|
||||
FCITX_UNUSED(entry);
|
||||
auto* ic = event.inputContext();
|
||||
auto* state = ic->propertyFor(&factory_);
|
||||
state->clear();
|
||||
}
|
||||
|
||||
void SttEngine::startProcess() {
|
||||
if (childPid_ > 0) {
|
||||
return; // Already running
|
||||
}
|
||||
|
||||
int stdinPipe[2];
|
||||
int stdoutPipe[2];
|
||||
|
||||
if (pipe(stdinPipe) < 0 || pipe(stdoutPipe) < 0) {
|
||||
STT_ERROR() << "Failed to create pipes";
|
||||
return;
|
||||
}
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
STT_ERROR() << "Failed to fork";
|
||||
close(stdinPipe[0]);
|
||||
close(stdinPipe[1]);
|
||||
close(stdoutPipe[0]);
|
||||
close(stdoutPipe[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
// Child process
|
||||
close(stdinPipe[1]);
|
||||
close(stdoutPipe[0]);
|
||||
|
||||
dup2(stdinPipe[0], STDIN_FILENO);
|
||||
dup2(stdoutPipe[1], STDOUT_FILENO);
|
||||
|
||||
close(stdinPipe[0]);
|
||||
close(stdoutPipe[1]);
|
||||
|
||||
// Determine mode string
|
||||
const char* modeStr = "manual";
|
||||
switch (mode_) {
|
||||
case SttMode::Oneshot: modeStr = "oneshot"; break;
|
||||
case SttMode::Continuous: modeStr = "continuous"; break;
|
||||
case SttMode::Manual: modeStr = "manual"; break;
|
||||
}
|
||||
|
||||
execlp(STT_STREAM_PATH, "stt-stream", "--mode", modeStr, nullptr);
|
||||
_exit(127);
|
||||
}
|
||||
|
||||
// Parent process
|
||||
close(stdinPipe[0]);
|
||||
close(stdoutPipe[1]);
|
||||
|
||||
childPid_ = pid;
|
||||
stdinFd_ = stdinPipe[1];
|
||||
stdoutFd_ = stdoutPipe[0];
|
||||
|
||||
// Set stdout non-blocking
|
||||
int flags = fcntl(stdoutFd_, F_GETFL, 0);
|
||||
fcntl(stdoutFd_, F_SETFL, flags | O_NONBLOCK);
|
||||
|
||||
// Watch stdout for events
|
||||
ioEvent_ = instance_->eventLoop().addIOEvent(
|
||||
stdoutFd_,
|
||||
fcitx::IOEventFlag::In,
|
||||
[this](fcitx::EventSourceIO*, int, fcitx::IOEventFlags) {
|
||||
onProcessOutput();
|
||||
return true;
|
||||
}
|
||||
);
|
||||
|
||||
STT_INFO() << "Started stt-stream process (pid=" << childPid_ << ")";
|
||||
}
|
||||
|
||||
void SttEngine::stopProcess() {
|
||||
if (childPid_ < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
ioEvent_.reset();
|
||||
|
||||
sendCommand("shutdown");
|
||||
close(stdinFd_);
|
||||
close(stdoutFd_);
|
||||
|
||||
// Wait for child to exit
|
||||
int status;
|
||||
waitpid(childPid_, &status, 0);
|
||||
|
||||
stdinFd_ = -1;
|
||||
stdoutFd_ = -1;
|
||||
childPid_ = -1;
|
||||
ready_ = false;
|
||||
|
||||
STT_INFO() << "Stopped stt-stream process";
|
||||
}
|
||||
|
||||
void SttEngine::sendCommand(const std::string& cmd) {
|
||||
if (stdinFd_ < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::string line = cmd + "\n";
|
||||
write(stdinFd_, line.c_str(), line.length());
|
||||
}
|
||||
|
||||
void SttEngine::onProcessOutput() {
|
||||
char buf[4096];
|
||||
ssize_t n;
|
||||
|
||||
while ((n = read(stdoutFd_, buf, sizeof(buf) - 1)) > 0) {
|
||||
buf[n] = '\0';
|
||||
readBuffer_ += buf;
|
||||
|
||||
// Process complete lines
|
||||
size_t pos;
|
||||
while ((pos = readBuffer_.find('\n')) != std::string::npos) {
|
||||
std::string line = readBuffer_.substr(0, pos);
|
||||
readBuffer_ = readBuffer_.substr(pos + 1);
|
||||
|
||||
if (!line.empty()) {
|
||||
auto ev = JsonEvent::parse(line);
|
||||
handleEvent(ev);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SttEngine::handleEvent(const JsonEvent& ev) {
|
||||
STT_DEBUG() << "Event: type=" << ev.type << " text=" << ev.text;
|
||||
|
||||
if (ev.type == "ready") {
|
||||
ready_ = true;
|
||||
STT_INFO() << "stt-stream ready";
|
||||
} else if (ev.type == "recording_started") {
|
||||
// Update UI to show recording state
|
||||
if (activeIc_) {
|
||||
auto* state = activeIc_->propertyFor(&factory_);
|
||||
state->setRecording(true);
|
||||
}
|
||||
} else if (ev.type == "recording_stopped") {
|
||||
if (activeIc_) {
|
||||
auto* state = activeIc_->propertyFor(&factory_);
|
||||
state->setRecording(false);
|
||||
}
|
||||
} else if (ev.type == "partial") {
|
||||
if (activeIc_) {
|
||||
auto* state = activeIc_->propertyFor(&factory_);
|
||||
state->setPreedit(ev.text);
|
||||
}
|
||||
} else if (ev.type == "final") {
|
||||
if (activeIc_) {
|
||||
auto* state = activeIc_->propertyFor(&factory_);
|
||||
state->commit(ev.text);
|
||||
state->setRecording(false);
|
||||
|
||||
// In oneshot mode, we're done
|
||||
// In continuous mode, keep listening
|
||||
if (mode_ == SttMode::Continuous && ready_) {
|
||||
sendCommand("start");
|
||||
state->setRecording(true);
|
||||
}
|
||||
}
|
||||
} else if (ev.type == "error") {
|
||||
STT_ERROR() << "stt-stream error: " << ev.message;
|
||||
} else if (ev.type == "shutdown") {
|
||||
ready_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
void SttEngine::setMode(SttMode m) {
|
||||
if (mode_ == m) return;
|
||||
|
||||
mode_ = m;
|
||||
|
||||
// Notify the backend
|
||||
const char* modeStr = "manual";
|
||||
switch (m) {
|
||||
case SttMode::Oneshot: modeStr = "oneshot"; break;
|
||||
case SttMode::Continuous: modeStr = "continuous"; break;
|
||||
case SttMode::Manual: modeStr = "manual"; break;
|
||||
}
|
||||
|
||||
std::string cmd = "{\"cmd\":\"set_mode\",\"mode\":\"";
|
||||
cmd += modeStr;
|
||||
cmd += "\"}";
|
||||
sendCommand(cmd);
|
||||
|
||||
STT_INFO() << "Mode changed to: " << modeStr;
|
||||
}
|
||||
|
||||
void SttEngine::cycleMode() {
|
||||
switch (mode_) {
|
||||
case SttMode::Manual:
|
||||
setMode(SttMode::Oneshot);
|
||||
break;
|
||||
case SttMode::Oneshot:
|
||||
setMode(SttMode::Continuous);
|
||||
break;
|
||||
case SttMode::Continuous:
|
||||
setMode(SttMode::Manual);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Addon factory
|
||||
class SttEngineFactory : public fcitx::AddonFactory {
|
||||
public:
|
||||
fcitx::AddonInstance* create(fcitx::AddonManager* manager) override {
|
||||
return new SttEngine(manager->instance());
|
||||
}
|
||||
};
|
||||
|
||||
FCITX_ADDON_FACTORY(SttEngineFactory);
|
||||
77
flakes/stt_ime/flake.lock
generated
77
flakes/stt_ime/flake.lock
generated
|
|
@ -1,77 +0,0 @@
|
|||
{
|
||||
"nodes": {
|
||||
"crane": {
|
||||
"locked": {
|
||||
"lastModified": 1768319649,
|
||||
"narHash": "sha256-VFkNyxHxkqGp8gf8kfFMW1j6XeBy609kv6TE9uF/0Js=",
|
||||
"owner": "ipetkov",
|
||||
"repo": "crane",
|
||||
"rev": "4b6527687cfd20da3c2ef8287e01b74c2d6c705b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "ipetkov",
|
||||
"repo": "crane",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1768127708,
|
||||
"narHash": "sha256-1Sm77VfZh3mU0F5OqKABNLWxOuDeHIlcFjsXeeiPazs=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ffbc9f8cbaacfb331b6017d5a5abb21a492c9a38",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"crane": "crane",
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
{
|
||||
description = "Local speech-to-text input method for Fcitx5";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||
crane.url = "github:ipetkov/crane";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
};
|
||||
|
||||
outputs =
|
||||
{
|
||||
self,
|
||||
nixpkgs,
|
||||
crane,
|
||||
flake-utils,
|
||||
...
|
||||
}:
|
||||
let
|
||||
# Systems we support
|
||||
supportedSystems = [
|
||||
"x86_64-linux"
|
||||
"aarch64-linux"
|
||||
];
|
||||
in
|
||||
flake-utils.lib.eachSystem supportedSystems (
|
||||
system:
|
||||
let
|
||||
pkgs = nixpkgs.legacyPackages.${system};
|
||||
craneLib = crane.mkLib pkgs;
|
||||
|
||||
# Rust STT streaming CLI
|
||||
stt-stream = craneLib.buildPackage {
|
||||
pname = "stt-stream";
|
||||
version = "0.1.0";
|
||||
src = craneLib.cleanCargoSource ./stt-stream;
|
||||
|
||||
nativeBuildInputs = with pkgs; [
|
||||
pkg-config
|
||||
cmake # for whisper-rs
|
||||
clang
|
||||
llvmPackages.libclang
|
||||
];
|
||||
|
||||
buildInputs = with pkgs; [
|
||||
alsa-lib
|
||||
openssl
|
||||
# whisper.cpp dependencies
|
||||
openblas
|
||||
];
|
||||
|
||||
# For bindgen to find libclang
|
||||
LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib";
|
||||
|
||||
# Enable CUDA if available (user can override)
|
||||
WHISPER_CUBLAS = "OFF";
|
||||
};
|
||||
|
||||
# Fcitx5 C++ shim addon
|
||||
fcitx5-stt = pkgs.stdenv.mkDerivation {
|
||||
pname = "fcitx5-stt";
|
||||
version = "0.1.0";
|
||||
src = ./fcitx5-stt;
|
||||
|
||||
nativeBuildInputs = with pkgs; [
|
||||
cmake
|
||||
extra-cmake-modules
|
||||
pkg-config
|
||||
];
|
||||
|
||||
buildInputs = with pkgs; [
|
||||
fcitx5
|
||||
];
|
||||
|
||||
cmakeFlags = [
|
||||
"-DSTT_STREAM_PATH=${stt-stream}/bin/stt-stream"
|
||||
];
|
||||
|
||||
# Install to fcitx5 addon paths
|
||||
postInstall = ''
|
||||
mkdir -p $out/share/fcitx5/addon
|
||||
mkdir -p $out/share/fcitx5/inputmethod
|
||||
mkdir -p $out/lib/fcitx5
|
||||
'';
|
||||
};
|
||||
in
|
||||
{
|
||||
packages = {
|
||||
inherit stt-stream fcitx5-stt;
|
||||
default = fcitx5-stt;
|
||||
};
|
||||
|
||||
# Expose as runnable apps
|
||||
apps = {
|
||||
stt-stream = {
|
||||
type = "app";
|
||||
program = "${stt-stream}/bin/stt-stream";
|
||||
};
|
||||
default = {
|
||||
type = "app";
|
||||
program = "${stt-stream}/bin/stt-stream";
|
||||
};
|
||||
};
|
||||
|
||||
devShells.default = pkgs.mkShell {
|
||||
inputsFrom = [ stt-stream ];
|
||||
packages = with pkgs; [
|
||||
rust-analyzer
|
||||
rustfmt
|
||||
clippy
|
||||
fcitx5
|
||||
];
|
||||
};
|
||||
}
|
||||
)
|
||||
// {
|
||||
# NixOS module for integration
|
||||
nixosModules.default =
|
||||
{
|
||||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
...
|
||||
}:
|
||||
let
|
||||
cfg = config.ringofstorms.sttIme;
|
||||
sttPkgs = self.packages.${pkgs.stdenv.hostPlatform.system};
|
||||
in
|
||||
{
|
||||
options.ringofstorms.sttIme = {
|
||||
enable = lib.mkEnableOption "Speech-to-text input method for Fcitx5";
|
||||
|
||||
model = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "base.en";
|
||||
description = "Whisper model to use (tiny, base, small, medium, large)";
|
||||
};
|
||||
|
||||
useGpu = lib.mkOption {
|
||||
type = lib.types.bool;
|
||||
default = false;
|
||||
description = "Whether to use GPU acceleration (CUDA)";
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
# Ensure fcitx5 addon is available
|
||||
i18n.inputMethod.fcitx5.addons = [ sttPkgs.fcitx5-stt ];
|
||||
|
||||
# Add STT to the Fcitx5 input method group
|
||||
# This assumes de_plasma sets up Groups/0 with keyboard-us (0) and mozc (1)
|
||||
i18n.inputMethod.fcitx5.settings.inputMethod = {
|
||||
"Groups/0/Items/2".Name = "stt";
|
||||
};
|
||||
|
||||
# Make stt-stream available system-wide
|
||||
environment.systemPackages = [ sttPkgs.stt-stream ];
|
||||
|
||||
# Set default model via environment
|
||||
environment.sessionVariables = {
|
||||
STT_STREAM_MODEL = cfg.model;
|
||||
STT_STREAM_USE_GPU = if cfg.useGpu then "1" else "0";
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
2487
flakes/stt_ime/stt-stream/Cargo.lock
generated
2487
flakes/stt_ime/stt-stream/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -1,50 +0,0 @@
|
|||
[package]
|
||||
name = "stt-stream"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
description = "Local speech-to-text streaming CLI for Fcitx5 integration"
|
||||
license = "MIT"
|
||||
|
||||
[dependencies]
|
||||
# Audio capture
|
||||
cpal = "0.15"
|
||||
# Resampling (48k -> 16k)
|
||||
rubato = "0.15"
|
||||
# Whisper inference
|
||||
whisper-rs = "0.12"
|
||||
# Voice activity detection
|
||||
# Using silero via ONNX (reserved for future use)
|
||||
# ort = { version = "2.0.0-rc.9", default-features = false, features = ["load-dynamic"] }
|
||||
# ndarray = "0.16"
|
||||
|
||||
# Async runtime
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
# CLI
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
|
||||
# Serialization for IPC protocol
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
# Error handling
|
||||
anyhow = "1"
|
||||
thiserror = "1"
|
||||
|
||||
# Logging
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
|
||||
# Ring buffer for audio (reserved for future use)
|
||||
# ringbuf = "0.4"
|
||||
|
||||
# For downloading models
|
||||
hf-hub = "0.3"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
cuda = ["whisper-rs/cuda"]
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
codegen-units = 1
|
||||
|
|
@ -1,599 +0,0 @@
|
|||
//! stt-stream: Local speech-to-text streaming CLI
|
||||
//!
|
||||
//! Captures audio from microphone, performs VAD, transcribes with Whisper,
|
||||
//! and outputs JSON events to stdout for Fcitx5 integration.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, ValueEnum};
|
||||
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
|
||||
use rubato::{FftFixedInOut, Resampler};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io::{BufRead, Write};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{error, info, warn};
|
||||
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
|
||||
|
||||
/// Operating mode for the STT engine
|
||||
#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
|
||||
pub enum Mode {
|
||||
/// Record until silence, transcribe, then reset (one-shot)
|
||||
Oneshot,
|
||||
/// Always listen, emit text when speech detected (continuous)
|
||||
Continuous,
|
||||
/// Manual start/stop via stdin commands
|
||||
Manual,
|
||||
}
|
||||
|
||||
/// Whisper model size
|
||||
#[derive(Debug, Clone, Copy, ValueEnum)]
|
||||
pub enum ModelSize {
|
||||
Tiny,
|
||||
TinyEn,
|
||||
Base,
|
||||
BaseEn,
|
||||
Small,
|
||||
SmallEn,
|
||||
Medium,
|
||||
MediumEn,
|
||||
LargeV3,
|
||||
}
|
||||
|
||||
impl ModelSize {
|
||||
fn model_name(&self) -> &'static str {
|
||||
match self {
|
||||
ModelSize::Tiny => "tiny",
|
||||
ModelSize::TinyEn => "tiny.en",
|
||||
ModelSize::Base => "base",
|
||||
ModelSize::BaseEn => "base.en",
|
||||
ModelSize::Small => "small",
|
||||
ModelSize::SmallEn => "small.en",
|
||||
ModelSize::Medium => "medium",
|
||||
ModelSize::MediumEn => "medium.en",
|
||||
ModelSize::LargeV3 => "large-v3",
|
||||
}
|
||||
}
|
||||
|
||||
fn hf_repo(&self) -> &'static str {
|
||||
"ggerganov/whisper.cpp"
|
||||
}
|
||||
|
||||
fn hf_filename(&self) -> String {
|
||||
format!("ggml-{}.bin", self.model_name())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "stt-stream")]
|
||||
#[command(about = "Local speech-to-text streaming for Fcitx5")]
|
||||
struct Args {
|
||||
/// Operating mode
|
||||
#[arg(short, long, value_enum, default_value = "manual")]
|
||||
mode: Mode,
|
||||
|
||||
/// Whisper model size
|
||||
#[arg(short = 'M', long, value_enum, default_value = "base-en")]
|
||||
model: ModelSize,
|
||||
|
||||
/// Path to whisper model file (overrides --model)
|
||||
#[arg(long)]
|
||||
model_path: Option<String>,
|
||||
|
||||
/// VAD threshold (0.0-1.0)
|
||||
#[arg(long, default_value = "0.5")]
|
||||
vad_threshold: f32,
|
||||
|
||||
/// Silence duration (ms) to end utterance
|
||||
#[arg(long, default_value = "800")]
|
||||
silence_ms: u64,
|
||||
|
||||
/// Emit partial transcripts while speaking
|
||||
#[arg(long, default_value = "true")]
|
||||
partials: bool,
|
||||
|
||||
/// Partial transcript interval (ms)
|
||||
#[arg(long, default_value = "500")]
|
||||
partial_interval_ms: u64,
|
||||
|
||||
/// Language code (e.g., "en", "ja", "auto")
|
||||
#[arg(short, long, default_value = "en")]
|
||||
language: String,
|
||||
|
||||
/// Use GPU acceleration
|
||||
#[arg(long)]
|
||||
gpu: bool,
|
||||
}
|
||||
|
||||
/// Events emitted to stdout as NDJSON
|
||||
#[derive(Debug, Serialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum SttEvent {
|
||||
/// STT engine is ready
|
||||
Ready,
|
||||
/// Recording started
|
||||
RecordingStarted,
|
||||
/// Recording stopped
|
||||
RecordingStopped,
|
||||
/// Partial (unstable) transcript
|
||||
Partial { text: String },
|
||||
/// Final transcript
|
||||
Final { text: String },
|
||||
/// Error occurred
|
||||
Error { message: String },
|
||||
/// Engine shutting down
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Commands received from stdin as NDJSON
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(tag = "cmd", rename_all = "snake_case")]
|
||||
pub enum SttCommand {
|
||||
/// Start recording
|
||||
Start,
|
||||
/// Stop recording and transcribe
|
||||
Stop,
|
||||
/// Cancel current recording without transcribing
|
||||
Cancel,
|
||||
/// Shutdown the engine
|
||||
Shutdown,
|
||||
/// Switch mode
|
||||
SetMode { mode: String },
|
||||
}
|
||||
|
||||
fn emit_event(event: &SttEvent) {
|
||||
if let Ok(json) = serde_json::to_string(event) {
|
||||
let mut stdout = std::io::stdout().lock();
|
||||
let _ = writeln!(stdout, "{}", json);
|
||||
let _ = stdout.flush();
|
||||
}
|
||||
}
|
||||
|
||||
/// Energy-based voice activity check (stand-in for Silero VAD).
///
/// Computes the mean-square level of `samples` in dB and compares it with a
/// cutoff derived from `threshold`: 0.0 maps to -50 dB and 1.0 to -20 dB.
/// Returns `true` when the chunk is louder than the cutoff; an empty chunk
/// is never speech.
fn simple_vad(samples: &[f32], threshold: f32) -> bool {
    let count = samples.len();
    if count == 0 {
        return false;
    }

    let mean_square = samples.iter().map(|x| x * x).sum::<f32>() / count as f32;
    // Floor the energy so pure silence does not produce -inf dB.
    let level_db = 10.0 * mean_square.max(1e-10).log10();

    // Typical speech is around -20 to -10 dB, silence is < -40 dB.
    let cutoff_db = -50.0 + (threshold * 30.0);

    level_db > cutoff_db
}
|
||||
|
||||
/// Download or locate the Whisper model
|
||||
fn get_model_path(args: &Args) -> Result<String> {
|
||||
if let Some(ref path) = args.model_path {
|
||||
return Ok(path.clone());
|
||||
}
|
||||
|
||||
// Check environment variable
|
||||
if let Ok(path) = std::env::var("STT_STREAM_MODEL_PATH") {
|
||||
if std::path::Path::new(&path).exists() {
|
||||
return Ok(path);
|
||||
}
|
||||
}
|
||||
|
||||
// Check XDG cache
|
||||
let cache_dir = dirs::cache_dir()
|
||||
.unwrap_or_else(|| std::path::PathBuf::from("."))
|
||||
.join("stt-stream")
|
||||
.join("models");
|
||||
|
||||
let model_file = cache_dir.join(args.model.hf_filename());
|
||||
if model_file.exists() {
|
||||
return Ok(model_file.to_string_lossy().to_string());
|
||||
}
|
||||
|
||||
// Download from Hugging Face
|
||||
info!("Downloading model {} from Hugging Face...", args.model.model_name());
|
||||
std::fs::create_dir_all(&cache_dir)?;
|
||||
|
||||
let api = hf_hub::api::sync::Api::new()?;
|
||||
let repo = api.model(args.model.hf_repo().to_string());
|
||||
let path = repo.get(&args.model.hf_filename())?;
|
||||
|
||||
Ok(path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
/// Mutable capture state shared between the stdin command thread and the
/// processing loop.
struct AudioState {
    /// Samples collected for the current utterance (16kHz mono).
    buffer: Vec<f32>,
    /// True while audio is being recorded.
    is_recording: bool,
    /// True once speech has been seen in the current segment.
    speech_detected: bool,
    /// Number of samples received since speech was last seen.
    silence_samples: usize,
    /// When the last partial transcript was scheduled.
    last_partial: std::time::Instant,
}

impl AudioState {
    /// Initial buffer capacity: 30 seconds of 16kHz audio.
    const INITIAL_CAPACITY: usize = 16000 * 30;

    /// Create an idle state with an empty, pre-allocated buffer.
    fn new() -> Self {
        let buffer = Vec::with_capacity(Self::INITIAL_CAPACITY);
        let last_partial = std::time::Instant::now();
        Self {
            buffer,
            is_recording: false,
            speech_detected: false,
            silence_samples: 0,
            last_partial,
        }
    }

    /// Reset the per-utterance fields. `is_recording` and `last_partial`
    /// are left untouched.
    fn clear(&mut self) {
        self.silence_samples = 0;
        self.speech_detected = false;
        self.buffer.clear();
    }
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
// Initialize logging
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
tracing_subscriber::EnvFilter::from_default_env()
|
||||
.add_directive("stt_stream=info".parse().unwrap()),
|
||||
)
|
||||
.with_writer(std::io::stderr)
|
||||
.init();
|
||||
|
||||
let args = Args::parse();
|
||||
info!("Starting stt-stream with mode: {:?}", args.mode);
|
||||
|
||||
// Load Whisper model
|
||||
let model_path = get_model_path(&args).context("Failed to get model path")?;
|
||||
info!("Loading Whisper model from: {}", model_path);
|
||||
|
||||
let ctx_params = WhisperContextParameters::default();
|
||||
let whisper_ctx = WhisperContext::new_with_params(&model_path, ctx_params)
|
||||
.context("Failed to load Whisper model")?;
|
||||
|
||||
let whisper_ctx = Arc::new(Mutex::new(whisper_ctx));
|
||||
|
||||
// Audio capture setup
|
||||
let host = cpal::default_host();
|
||||
let device = host
|
||||
.default_input_device()
|
||||
.context("No input device available")?;
|
||||
|
||||
info!("Using input device: {}", device.name().unwrap_or_default());
|
||||
|
||||
let config = device.default_input_config()?;
|
||||
let sample_rate = config.sample_rate().0;
|
||||
let channels = config.channels() as usize;
|
||||
|
||||
info!("Input config: {}Hz, {} channels", sample_rate, channels);
|
||||
|
||||
// Resampler: input rate -> 16kHz
|
||||
let resampler = if sample_rate != 16000 {
|
||||
Some(Arc::new(Mutex::new(
|
||||
FftFixedInOut::<f32>::new(sample_rate as usize, 16000, 1024, 1)
|
||||
.context("Failed to create resampler")?,
|
||||
)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Shared state
|
||||
let audio_state = Arc::new(Mutex::new(AudioState::new()));
|
||||
let running = Arc::new(AtomicBool::new(true));
|
||||
let mode = Arc::new(Mutex::new(args.mode));
|
||||
|
||||
// Channel for audio data
|
||||
let (audio_tx, mut audio_rx) = mpsc::channel::<Vec<f32>>(100);
|
||||
|
||||
// Audio callback
|
||||
let resampler_clone = resampler.clone();
|
||||
let running_clone = running.clone();
|
||||
|
||||
let stream = device.build_input_stream(
|
||||
&config.into(),
|
||||
move |data: &[f32], _: &cpal::InputCallbackInfo| {
|
||||
if !running_clone.load(Ordering::Relaxed) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Convert to mono if needed
|
||||
let mono: Vec<f32> = if channels > 1 {
|
||||
data.chunks(channels)
|
||||
.map(|frame| frame.iter().sum::<f32>() / channels as f32)
|
||||
.collect()
|
||||
} else {
|
||||
data.to_vec()
|
||||
};
|
||||
|
||||
// Resample if needed
|
||||
let resampled = if let Some(ref resampler) = resampler_clone {
|
||||
if let Ok(mut r) = resampler.lock() {
|
||||
// Pad input to required length
|
||||
let input_frames = r.input_frames_next();
|
||||
if mono.len() >= input_frames {
|
||||
let input = vec![mono[..input_frames].to_vec()];
|
||||
match r.process(&input, None) {
|
||||
Ok(output) => output.into_iter().flatten().collect(),
|
||||
Err(_) => return,
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
mono
|
||||
};
|
||||
|
||||
let _ = audio_tx.blocking_send(resampled);
|
||||
},
|
||||
|err| {
|
||||
error!("Audio stream error: {}", err);
|
||||
},
|
||||
None,
|
||||
)?;
|
||||
|
||||
stream.play()?;
|
||||
emit_event(&SttEvent::Ready);
|
||||
|
||||
// Stdin command reader
|
||||
let running_stdin = running.clone();
|
||||
let mode_stdin = mode.clone();
|
||||
let audio_state_stdin = audio_state.clone();
|
||||
|
||||
let stdin_handle = std::thread::spawn(move || {
|
||||
let stdin = std::io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
if !running_stdin.load(Ordering::Relaxed) {
|
||||
break;
|
||||
}
|
||||
|
||||
let line = match line {
|
||||
Ok(l) => l,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let cmd: SttCommand = match serde_json::from_str(&line) {
|
||||
Ok(c) => c,
|
||||
Err(_) => {
|
||||
// Try simple text commands
|
||||
match line.trim().to_lowercase().as_str() {
|
||||
"start" => SttCommand::Start,
|
||||
"stop" => SttCommand::Stop,
|
||||
"cancel" => SttCommand::Cancel,
|
||||
"shutdown" | "quit" | "exit" => SttCommand::Shutdown,
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match cmd {
|
||||
SttCommand::Start => {
|
||||
if let Ok(mut state) = audio_state_stdin.lock() {
|
||||
state.is_recording = true;
|
||||
state.clear();
|
||||
emit_event(&SttEvent::RecordingStarted);
|
||||
}
|
||||
}
|
||||
SttCommand::Stop => {
|
||||
if let Ok(mut state) = audio_state_stdin.lock() {
|
||||
state.is_recording = false;
|
||||
emit_event(&SttEvent::RecordingStopped);
|
||||
}
|
||||
}
|
||||
SttCommand::Cancel => {
|
||||
if let Ok(mut state) = audio_state_stdin.lock() {
|
||||
state.is_recording = false;
|
||||
state.clear();
|
||||
emit_event(&SttEvent::RecordingStopped);
|
||||
}
|
||||
}
|
||||
SttCommand::Shutdown => {
|
||||
running_stdin.store(false, Ordering::Relaxed);
|
||||
break;
|
||||
}
|
||||
SttCommand::SetMode { mode: m } => {
|
||||
if let Ok(mut current_mode) = mode_stdin.lock() {
|
||||
*current_mode = match m.as_str() {
|
||||
"oneshot" => Mode::Oneshot,
|
||||
"continuous" => Mode::Continuous,
|
||||
"manual" => Mode::Manual,
|
||||
_ => continue,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Main processing loop
|
||||
let vad_threshold = args.vad_threshold;
|
||||
let silence_samples_threshold = (args.silence_ms as f32 * 16.0) as usize; // 16 samples per ms at 16kHz
|
||||
let partial_interval = std::time::Duration::from_millis(args.partial_interval_ms);
|
||||
let emit_partials = args.partials;
|
||||
let language = args.language.clone();
|
||||
|
||||
while running.load(Ordering::Relaxed) {
|
||||
// Receive audio data
|
||||
let samples = match tokio::time::timeout(
|
||||
std::time::Duration::from_millis(100),
|
||||
audio_rx.recv(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Some(s)) => s,
|
||||
Ok(None) => break,
|
||||
Err(_) => continue, // Timeout, check running flag
|
||||
};
|
||||
|
||||
let current_mode = *mode.lock().unwrap();
|
||||
let mut state = audio_state.lock().unwrap();
|
||||
|
||||
// Mode-specific behavior
|
||||
match current_mode {
|
||||
Mode::Manual => {
|
||||
if !state.is_recording {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Mode::Oneshot | Mode::Continuous => {
|
||||
// Auto-start on speech detection
|
||||
let has_speech = simple_vad(&samples, vad_threshold);
|
||||
|
||||
if !state.is_recording && has_speech {
|
||||
state.is_recording = true;
|
||||
state.clear();
|
||||
emit_event(&SttEvent::RecordingStarted);
|
||||
}
|
||||
|
||||
if !state.is_recording {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Accumulate audio
|
||||
state.buffer.extend_from_slice(&samples);
|
||||
|
||||
// VAD check
|
||||
let has_speech = simple_vad(&samples, vad_threshold);
|
||||
if has_speech {
|
||||
state.speech_detected = true;
|
||||
state.silence_samples = 0;
|
||||
} else {
|
||||
state.silence_samples += samples.len();
|
||||
}
|
||||
|
||||
// Emit partial transcript if enabled
|
||||
if emit_partials
|
||||
&& state.speech_detected
|
||||
&& state.last_partial.elapsed() > partial_interval
|
||||
&& state.buffer.len() > 16000 // At least 1 second
|
||||
{
|
||||
state.last_partial = std::time::Instant::now();
|
||||
let buffer_copy = state.buffer.clone();
|
||||
let ctx = whisper_ctx.clone();
|
||||
let lang = language.clone();
|
||||
|
||||
// Transcribe in background
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if let Ok(text) = transcribe(&ctx, &buffer_copy, &lang, false) {
|
||||
if !text.is_empty() {
|
||||
emit_event(&SttEvent::Partial { text });
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Check for end of utterance
|
||||
let should_finalize = match current_mode {
|
||||
Mode::Manual => !state.is_recording && state.speech_detected,
|
||||
Mode::Oneshot | Mode::Continuous => {
|
||||
state.speech_detected && state.silence_samples > silence_samples_threshold
|
||||
}
|
||||
};
|
||||
|
||||
if should_finalize && !state.buffer.is_empty() {
|
||||
let buffer_copy = state.buffer.clone();
|
||||
let ctx = whisper_ctx.clone();
|
||||
let lang = language.clone();
|
||||
|
||||
// Final transcription
|
||||
match transcribe(&ctx, &buffer_copy, &lang, true) {
|
||||
Ok(text) => {
|
||||
if !text.is_empty() {
|
||||
emit_event(&SttEvent::Final { text });
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
emit_event(&SttEvent::Error {
|
||||
message: e.to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
state.clear();
|
||||
state.is_recording = current_mode == Mode::Continuous;
|
||||
|
||||
if current_mode == Mode::Oneshot {
|
||||
emit_event(&SttEvent::RecordingStopped);
|
||||
}
|
||||
}
|
||||
|
||||
// Prevent buffer from growing too large
|
||||
if state.buffer.len() > 16000 * 30 {
|
||||
warn!("Buffer too large, truncating");
|
||||
let start = state.buffer.len() - 16000 * 20;
|
||||
state.buffer = state.buffer[start..].to_vec();
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
drop(stream);
|
||||
emit_event(&SttEvent::Shutdown);
|
||||
let _ = stdin_handle.join();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Transcribe audio buffer using Whisper
|
||||
fn transcribe(
|
||||
ctx: &Arc<Mutex<WhisperContext>>,
|
||||
samples: &[f32],
|
||||
language: &str,
|
||||
is_final: bool,
|
||||
) -> Result<String> {
|
||||
let ctx = ctx.lock().map_err(|_| anyhow::anyhow!("Lock poisoned"))?;
|
||||
let mut state = ctx.create_state()?;
|
||||
|
||||
let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
|
||||
|
||||
// Configure for speed vs accuracy
|
||||
if is_final {
|
||||
params.set_n_threads(4);
|
||||
} else {
|
||||
params.set_n_threads(2);
|
||||
params.set_no_context(true);
|
||||
}
|
||||
|
||||
params.set_language(Some(language));
|
||||
params.set_print_special(false);
|
||||
params.set_print_progress(false);
|
||||
params.set_print_realtime(false);
|
||||
params.set_print_timestamps(false);
|
||||
params.set_suppress_blank(true);
|
||||
params.set_suppress_non_speech_tokens(true);
|
||||
|
||||
// Run inference
|
||||
state.full(params, samples)?;
|
||||
|
||||
// Collect segments
|
||||
let num_segments = state.full_n_segments()?;
|
||||
let mut text = String::new();
|
||||
|
||||
for i in 0..num_segments {
|
||||
if let Ok(segment) = state.full_get_segment_text(i) {
|
||||
text.push_str(&segment);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(text.trim().to_string())
|
||||
}
|
||||
|
||||
/// Stub for dirs crate functionality
mod dirs {
    use std::path::PathBuf;

    /// Locate the user's cache directory.
    ///
    /// Uses `$XDG_CACHE_HOME` when it is set, non-empty and absolute;
    /// otherwise falls back to `$HOME/.cache`. Returns `None` when neither
    /// variable yields a usable path.
    pub fn cache_dir() -> Option<PathBuf> {
        // The XDG Base Directory spec says an empty or relative value must
        // be ignored, as if the variable were unset.
        non_empty_env_path("XDG_CACHE_HOME")
            .filter(|dir| dir.is_absolute())
            .or_else(|| non_empty_env_path("HOME").map(|home| home.join(".cache")))
    }

    /// Read an environment variable as a path, treating an empty value as
    /// unset. `var_os` is used so non-UTF-8 paths are not rejected.
    fn non_empty_env_path(name: &str) -> Option<PathBuf> {
        std::env::var_os(name)
            .filter(|value| !value.is_empty())
            .map(PathBuf::from)
    }
}
|
||||
101
hosts/lio/flake.lock
generated
101
hosts/lio/flake.lock
generated
|
|
@ -87,21 +87,6 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"crane_2": {
|
||||
"locked": {
|
||||
"lastModified": 1768319649,
|
||||
"narHash": "sha256-VFkNyxHxkqGp8gf8kfFMW1j6XeBy609kv6TE9uF/0Js=",
|
||||
"owner": "ipetkov",
|
||||
"repo": "crane",
|
||||
"rev": "4b6527687cfd20da3c2ef8287e01b74c2d6c705b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "ipetkov",
|
||||
"repo": "crane",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"darwin": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
|
|
@ -131,14 +116,20 @@
|
|||
"plasma-manager": "plasma-manager"
|
||||
},
|
||||
"locked": {
|
||||
"path": "../../flakes/de_plasma",
|
||||
"type": "path"
|
||||
"dir": "flakes/de_plasma",
|
||||
"lastModified": 1768233301,
|
||||
"narHash": "sha256-m7Og7WuCT8VdQdLhsR6J7ZCR+aFM5ddJ7A1Kt2LBXQs=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "128209e4aa8927b7514bcfd2acaf097ac0d59310",
|
||||
"revCount": 1122,
|
||||
"type": "git",
|
||||
"url": "https://git.joshuabell.xyz/ringofstorms/dotfiles"
|
||||
},
|
||||
"original": {
|
||||
"path": "../../flakes/de_plasma",
|
||||
"type": "path"
|
||||
},
|
||||
"parent": []
|
||||
"dir": "flakes/de_plasma",
|
||||
"type": "git",
|
||||
"url": "https://git.joshuabell.xyz/ringofstorms/dotfiles"
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
|
|
@ -158,24 +149,6 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils_2": {
|
||||
"inputs": {
|
||||
"systems": "systems_3"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flatpaks": {
|
||||
"inputs": {
|
||||
"nix-flatpak": "nix-flatpak"
|
||||
|
|
@ -388,22 +361,6 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_7": {
|
||||
"locked": {
|
||||
"lastModified": 1768127708,
|
||||
"narHash": "sha256-1Sm77VfZh3mU0F5OqKABNLWxOuDeHIlcFjsXeeiPazs=",
|
||||
"owner": "nixos",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ffbc9f8cbaacfb331b6017d5a5abb21a492c9a38",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nixos",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nvim_plugin-Almo7aya/openingh.nvim": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
|
|
@ -1340,8 +1297,7 @@
|
|||
"opencode": "opencode",
|
||||
"ros_neovim": "ros_neovim",
|
||||
"secrets": "secrets",
|
||||
"secrets-bao": "secrets-bao",
|
||||
"stt_ime": "stt_ime"
|
||||
"secrets-bao": "secrets-bao"
|
||||
}
|
||||
},
|
||||
"ros_neovim": {
|
||||
|
|
@ -1491,22 +1447,6 @@
|
|||
},
|
||||
"parent": []
|
||||
},
|
||||
"stt_ime": {
|
||||
"inputs": {
|
||||
"crane": "crane_2",
|
||||
"flake-utils": "flake-utils_2",
|
||||
"nixpkgs": "nixpkgs_7"
|
||||
},
|
||||
"locked": {
|
||||
"path": "../../flakes/stt_ime",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"path": "../../flakes/stt_ime",
|
||||
"type": "path"
|
||||
},
|
||||
"parent": []
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
|
|
@ -1536,21 +1476,6 @@
|
|||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_3": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
|
|||
|
|
@ -16,10 +16,8 @@
|
|||
flatpaks.url = "git+https://git.joshuabell.xyz/ringofstorms/dotfiles?dir=flakes/flatpaks";
|
||||
# beszel.url = "path:../../flakes/beszel";
|
||||
beszel.url = "git+https://git.joshuabell.xyz/ringofstorms/dotfiles?dir=flakes/beszel";
|
||||
de_plasma.url = "path:../../flakes/de_plasma";
|
||||
# de_plasma.url = "git+https://git.joshuabell.xyz/ringofstorms/dotfiles?dir=flakes/de_plasma";
|
||||
stt_ime.url = "path:../../flakes/stt_ime";
|
||||
# stt_ime.url = "git+https://git.joshuabell.xyz/ringofstorms/dotfiles?dir=flakes/stt_ime";
|
||||
# de_plasma.url = "path:../../flakes/de_plasma";
|
||||
de_plasma.url = "git+https://git.joshuabell.xyz/ringofstorms/dotfiles?dir=flakes/de_plasma";
|
||||
|
||||
opencode.url = "github:sst/opencode?ref=latest";
|
||||
ros_neovim.url = "git+https://git.joshuabell.xyz/ringofstorms/nvim";
|
||||
|
|
@ -71,14 +69,6 @@
|
|||
# sddm.autologinUser = "josh";
|
||||
};
|
||||
})
|
||||
inputs.stt_ime.nixosModules.default
|
||||
({
|
||||
ringofstorms.sttIme = {
|
||||
enable = true;
|
||||
useGpu = true;
|
||||
};
|
||||
})
|
||||
|
||||
secrets.nixosModules.default
|
||||
ros_neovim.nixosModules.default
|
||||
({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue