[{"data":1,"prerenderedAt":978},["ShallowReactive",2],{"content-/zh/advanced-tutorial/audio-workflow":3},{"id":4,"title":5,"body":6,"description":970,"extension":971,"meta":972,"navigation":973,"path":974,"seo":975,"stem":976,"__hash__":977},"content/zh/advanced-tutorial/audio-workflow.md","ComfyUI 语音控制工作流教程：用语音命令生成图像",{"type":7,"value":8,"toc":936},"minimark",[9,13,22,25,29,36,52,55,70,73,75,78,81,106,110,195,197,201,205,227,231,284,288,322,324,328,331,335,338,368,372,377,395,399,423,427,439,443,446,516,520,559,564,566,570,573,577,580,598,602,607,618,621,669,673,739,743,759,764,766,770,774,785,789,803,807,821,825,836,840,857,859,863,910,913],[10,11,5],"h1",{"id":12},"comfyui-语音控制工作流教程用语音命令生成图像",[14,15,16,17,21],"p",{},"本篇分步教程将教你在 ComfyUI 中搭建",[18,19,20],"strong",{},"语音控制 AI 图像生成","工作流——从语音转文字（STT）集成，到可通过语音调节的各项参数。",[23,24],"hr",{},[26,27,28],"h2",{"id":28},"简介",[14,30,31,32,35],{},"ComfyUI 的语音控制工作流允许你使用",[18,33,34],{},"自然语音","控制 AI 图像生成，无需手动输入提示词或拖动滑块。这套方案非常适合：",[37,38,39,43,46,49],"ul",{},[40,41,42],"li",{},"免手动操作（如手绘、多任务同时进行时）",[40,44,45],{},"快速迭代（实时口述调整提示词）",[40,47,48],{},"无障碍使用（偏好语音而非文字的用户）",[40,50,51],{},"动态参数控制（用语音调节风格、分辨率、强度等）",[14,53,54],{},"本教程包含两大核心工作流：",[56,57,58,64],"ol",{},[40,59,60,63],{},[18,61,62],{},"基础版","：语音转文字（STT）→ 文生图（T2I）",[40,65,66,69],{},[18,67,68],{},"高级版","：语音控制参数（口述风格、强度、分辨率，无需操作界面）",[14,71,72],{},"所有工具均免费、开源，并兼容 ComfyUI 的节点化系统。",[23,74],{},[26,76,77],{"id":77},"准备工作",[14,79,80],{},"开始前请确保你已准备：",[37,82,83,94,97,100,103],{},[40,84,85,86,93],{},"已更新的 ComfyUI（推荐每日构建版；",[87,88,92],"a",{"href":89,"rel":90},"https://comfyui-box.com/basic-tutorial/installation",[91],"nofollow","安装指南","）",[40,95,96],{},"显存 ≥8GB 的 GPU（大型模型流畅运行建议 ≥12GB）",[40,98,99],{},"麦克风（内置或外置，语音输入必需）",[40,101,102],{},"稳定的网络（用于下载 STT 模型与插件）",[40,104,105],{},"掌握 ComfyUI 核心节点基础（Load Checkpoint、Sampler 等）",[107,108,109],"h3",{"id":109},"必需工具与模型",[111,112,113,129],"table",{},[114,115,116],"thead",{},[117,118,119,123,126],"tr",{},[120,121,122],"th",{},"组件",[120,124,125],{},"用途",[120,127,128],{},"下载/安装链接",[130,131,132,149,165,181],"tbody",{},[117,133,134,138,141],{},[135,136,137],"td",{},"ComfyUI 语音输入插件",[135,139,140],{},"为 ComfyUI 添加语音识别节点",[135,142,143,148],{},[87,144,147],{"href":145,"rel":146},"https://github.com/Comfy-Org/ComfyUI-Voice-Input",[91],"GitHub","（可通过 ComfyUI 管理器安装）",[117,150,151,154,157],{},[135,152,153],{},"Whisper STT 模型",[135,155,156],{},"开源语音转文字",[135,158,159,164],{},[87,160,163],{"href":161,"rel":162},"https://huggingface.co/openai/whisper-small",[91],"Hugging Face","（低显存可选 base 版）",[117,166,167,170,173],{},[135,168,169],{},"文生图模型",[135,171,172],{},"核心图像生成（如 SD 1.5/XL）",[135,174,175,180],{},[87,176,179],{"href":177,"rel":178},"https://civitai.com/models/433319/sd-xl-base-10",[91],"Civitai"," 或 ComfyUI 默认模型",[117,182,183,186,189],{},[135,184,185],{},"可选：ControlNet",[135,187,188],{},"用于语音控制姿态/风格",[135,190,191],{},[87,192,179],{"href":193,"rel":194},"https://civitai.com/models/38780/controlnet-11-openpose",[91],[23,196],{},[26,198,200],{"id":199},"第一部分在-comfyui-中配置语音输入","第一部分：在 ComfyUI 中配置语音输入",[107,202,204],{"id":203},"步骤-1安装语音输入插件","步骤 1：安装语音输入插件",[56,206,207,214,220],{},[40,208,209,210,213],{},"打开 ComfyUI，进入右上角 ",[18,211,212],{},"ComfyUI 管理器","。",[40,215,216,217,213],{},"在插件商店搜索 ",[18,218,219],{},"Voice Input",[40,221,222,223,226],{},"点击安装，",[18,224,225],{},"重启 ComfyUI"," 使插件生效。",[107,228,230],{"id":229},"步骤-2下载-whisper-stt-模型","步骤 2：下载 Whisper STT 模型",[56,232,233,262,273],{},[40,234,235,236,240,241],{},"进入 ",[87,237,239],{"href":161,"rel":238},[91],"Whisper 模型库","，根据显存选择：\n",[37,242,243,250,256],{},[40,244,245,249],{},[246,247,248],"code",{},"whisper-tiny","（1.1GB）：≥4GB 显存，更快但精度较低",[40,251,252,255],{},[246,253,254],{},"whisper-small","（4.1GB）：≥8GB 显存，速度与精度平衡",[40,257,258,261],{},[246,259,260],{},"whisper-medium","（13GB）：≥12GB 显存，精度最高",[40,263,264,265,268,269,272],{},"下载 ",[246,266,267],{},".bin"," 模型文件（如 ",[246,270,271],{},"pytorch_model.bin","）。",[40,274,275,276,279,280,283],{},"在 ",[246,277,278],{},"ComfyUI/models/"," 下新建文件夹 ",[246,281,282],{},"whisper/","，将模型放入。",[107,285,287],{"id":286},"步骤-3验证语音输入配置","步骤 3：验证语音输入配置",[56,289,290,299],{},[40,291,292,293,296,297,213],{},"重新打开 ComfyUI，在画布右键 → ",[18,294,295],{},"添加节点"," → ",[18,298,219],{},[40,300,301,302],{},"确认出现以下节点（说明插件安装成功）：\n",[37,303,304,310,316],{},[40,305,306,309],{},[246,307,308],{},"Voice Recorder","：录制麦克风输入",[40,311,312,315],{},[246,313,314],{},"Whisper STT","：语音转文字",[40,317,318,321],{},[246,319,320],{},"Voice Parameter Parser","：可选，从语音中提取参数",[23,323],{},[26,325,327],{"id":326},"第二部分基础工作流-语音转文字-文生图","第二部分：基础工作流 —— 语音转文字 → 文生图",[14,329,330],{},"本工作流将你的语音提示词转为文字，再用文字生成图像。",[107,332,334],{"id":333},"步骤-1添加核心节点","步骤 1：添加核心节点",[14,336,337],{},"在画布右键添加以下节点：",[56,339,340,344,348,353,358,363],{},[40,341,342],{},[18,343,308],{},[40,345,346],{},[18,347,314],{},[40,349,350],{},[18,351,352],{},"Load Checkpoint",[40,354,355],{},[18,356,357],{},"CLIP Text Encode",[40,359,360],{},[18,361,362],{},"KSampler",[40,364,365],{},[18,366,367],{},"Save Image",[107,369,371],{"id":370},"步骤-2配置节点","步骤 2：配置节点",[373,374,376],"h4",{"id":375},"_1-voice-recorder","1. Voice Recorder",[37,378,379,389],{},[40,380,381,384,385,388],{},[246,382,383],{},"device"," 保持 ",[246,386,387],{},"default","（使用系统默认麦克风）",[40,390,391,394],{},[246,392,393],{},"sample_rate"," 设置为 16000（Whisper 标准采样率）",[373,396,398],{"id":397},"_2-whisper-stt","2. Whisper STT",[37,400,401,409],{},[40,402,403,406,407,93],{},[246,404,405],{},"model_name"," 选择你下载的模型（如 ",[246,408,254],{},[40,410,411,414,415,418,419,422],{},[246,412,413],{},"language"," 设置为 ",[246,416,417],{},"zh","（中文）或 ",[246,420,421],{},"en","（英文）",[373,424,426],{"id":425},"_3-load-checkpoint","3. Load Checkpoint",[37,428,429],{},[40,430,431,432,435,436,93],{},"选择一个文生图模型（如 ",[246,433,434],{},"SDXL-Base-1.0","、",[246,437,438],{},"Realistic Vision 6.0",[107,440,442],{"id":441},"步骤-3连接工作流关键","步骤 3：连接工作流（关键）",[14,444,445],{},"按以下顺序连接：",[56,447,448,460,471,482,493,505],{},[40,449,450,452,453,296,456,452,458],{},[246,451,308],{}," 的 ",[246,454,455],{},"audio",[246,457,314],{},[246,459,455],{},[40,461,462,452,464,296,467,452,469],{},[246,463,314],{},[246,465,466],{},"text",[246,468,357],{},[246,470,466],{},[40,472,473,452,475,296,478,452,480],{},[246,474,352],{},[246,476,477],{},"model",[246,479,362],{},[246,481,477],{},[40,483,484,452,486,296,489,452,491],{},[246,485,352],{},[246,487,488],{},"clip",[246,490,357],{},[246,492,488],{},[40,494,495,452,497,296,500,452,502],{},[246,496,357],{},[246,498,499],{},"conditioning",[246,501,362],{},[246,503,504],{},"positive",[40,506,507,452,509,296,512,452,514],{},[246,508,362],{},[246,510,511],{},"image",[246,513,367],{},[246,515,511],{},[107,517,519],{"id":518},"步骤-4运行语音控制工作流","步骤 4：运行语音控制工作流",[56,521,522,532,541,547,552],{},[40,523,524,525,527,528,531],{},"点击 ",[246,526,308],{}," 上的 ",[18,529,530],{},"Record"," 按钮（变红表示正在录制）。",[40,533,534,535],{},"清晰说出提示词，例如：\n",[536,537,538],"blockquote",{},[14,539,540],{},"“赛博朋克城市夜晚，霓虹灯，照片级真实感，8K分辨率”",[40,542,524,543,546],{},[18,544,545],{},"Stop","（或静音5秒后自动停止）。",[40,548,549,551],{},[246,550,314],{}," 会将语音转为文字。",[40,553,554,555,558],{},"点击右上角 ",[18,556,557],{},"Queue Prompt"," 生成图像。",[536,560,561],{},[14,562,563],{},"专业提示\n长提示词请在句子间短暂停顿，Whisper 对自然停顿识别效果更好。若文字不准确，放慢语速、清晰咬字重新录制。",[23,565],{},[26,567,569],{"id":568},"第三部分高级工作流-语音控制参数","第三部分：高级工作流 —— 语音控制参数",[14,571,572],{},"进一步使用语音命令调节生成参数（风格、强度、分辨率等），无需编辑节点。",[107,574,576],{"id":575},"步骤-1添加高级节点","步骤 1：添加高级节点",[14,578,579],{},"在基础工作流上添加：",[56,581,582,586,592],{},[40,583,584],{},[18,585,320],{},[40,587,588,591],{},[18,589,590],{},"Float / Integer"," 节点（用于参数数值）",[40,593,594,597],{},[18,595,596],{},"ControlNet Loader","（可选，用于语音控制风格）",[107,599,601],{"id":600},"步骤-2定义可语音控制的参数","步骤 2：定义可语音控制的参数",[14,603,604,606],{},[246,605,320],{}," 可将语音命令映射为数值。\n示例：",[37,608,609,612,615],{},[40,610,611],{},"“set strength to 0.8” → 调整 LoRA/ControlNet 强度",[40,613,614],{},"“Resolution 1024x768” → 设置图像宽高",[40,616,617],{},"“Style realistic” → 切换写实风格模型",[14,619,620],{},"配置方法：",[56,622,623,629],{},[40,624,625,626,213],{},"在节点中点击 ",[246,627,628],{},"add parameter",[40,630,631,632],{},"为每个参数设置：\n",[37,633,634,652,663],{},[40,635,636,639,640,435,643,435,646,435,649],{},[246,637,638],{},"Parameter Name","：如 ",[246,641,642],{},"strength",[246,644,645],{},"width",[246,647,648],{},"height",[246,650,651],{},"style",[40,653,654,639,657,435,660],{},[246,655,656],{},"Command Prefix",[246,658,659],{},"set strength to",[246,661,662],{},"resolution width",[40,664,665,668],{},[246,666,667],{},"Default Value","：默认值",[107,670,672],{"id":671},"步骤-3连接参数节点","步骤 3：连接参数节点",[56,674,675,685,706,724],{},[40,676,677,452,679,296,681,452,683],{},[246,678,314],{},[246,680,466],{},[246,682,320],{},[246,684,466],{},[40,686,687,688],{},"强度控制：\n",[37,689,690],{},[40,691,692,694,695,698,699,702,703],{},[246,693,642],{}," 输出连接到 ",[246,696,697],{},"Float"," 节点，再连到 ",[246,700,701],{},"Load LoRA"," 或 ",[246,704,705],{},"ControlNet",[40,707,708,709],{},"分辨率控制：\n",[37,710,711],{},[40,712,713,715,716,718,719,698,722],{},[246,714,645],{},"/",[246,717,648],{}," 连接到 ",[246,720,721],{},"Integer",[246,723,362],{},[40,725,726,727],{},"风格控制（高级）：\n",[37,728,729,736],{},[40,730,731,732,735],{},"添加 ",[246,733,734],{},"Model Switch"," 节点",[40,737,738],{},"通过解析器将“style anime”“style realistic”映射到不同模型",[107,740,742],{"id":741},"步骤-4测试语音控制生成","步骤 4：测试语音控制生成",[56,744,745,753,756],{},[40,746,747,748],{},"录制带参数的语音命令，例如：\n",[536,749,750],{},[14,751,752],{},"“风格动漫，强度0.9，分辨率1024x1024，一只带翅膀的可爱小猫”",[40,754,755],{},"解析器会自动提取数值并传给对应节点。",[40,757,758],{},"提交任务生成图像。",[536,760,761],{},[14,762,763],{},"重要提示\n参数命令请保持简单、一致。避免模糊语句，例如用“set width to 1280”而不是“make it bigger”。",[23,765],{},[26,767,769],{"id":768},"第四部分常见问题排查","第四部分：常见问题排查",[107,771,773],{"id":772},"问题-1voice-recorder-无法捕获声音","问题 1：Voice Recorder 无法捕获声音",[37,775,776,779,782],{},[40,777,778],{},"检查麦克风是否设为默认设备",[40,780,781],{},"关闭其他占用麦克风的软件（Zoom、Discord 等）",[40,783,784],{},"重装语音输入插件",[107,786,788],{"id":787},"问题-2whisper-转文字不准确","问题 2：Whisper 转文字不准确",[37,790,791,794,797],{},[40,792,793],{},"使用更大的 Whisper 模型",[40,795,796],{},"放慢语速、减少背景噪音",[40,798,799,800,802],{},"在节点中设置正确语言（如 ",[246,801,417],{}," 中文）",[107,804,806],{"id":805},"问题-3参数无法通过语音更新","问题 3：参数无法通过语音更新",[37,808,809,815,818],{},[40,810,811,812,814],{},"检查 ",[246,813,656],{}," 是否与你说的语句一致",[40,816,817],{},"确认解析器输出已正确连接到目标节点",[40,819,820],{},"先用简单单参数命令测试，再用复杂命令",[107,822,824],{"id":823},"问题-4显存不足oom","问题 4：显存不足（OOM）",[37,826,827,830,833],{},[40,828,829],{},"使用更小的 Whisper 模型",[40,831,832],{},"降低图像分辨率",[40,834,835],{},"开启模型卸载（ComfyUI 设置 → 性能）",[107,837,839],{"id":838},"问题-5工作流运行但不出图","问题 5：工作流运行但不出图",[37,841,842,845,851],{},[40,843,844],{},"检查所有节点是否正确连接",[40,846,847,848,850],{},"确认 ",[246,849,367],{}," 路径有效",[40,852,853,854,856],{},"确保 ",[246,855,362],{}," 有正确的正面提示词",[23,858],{},[26,860,862],{"id":861},"第五部分最佳实践","第五部分：最佳实践",[56,864,865,880,886,892,898,904],{},[40,866,867,870,871,875,876,879],{},[18,868,869],{},"固定语音句式","：统一使用“set ",[872,873,874],"span",{},"参数"," to ",[872,877,878],{},"数值","”结构，识别更稳定。",[40,881,882,885],{},[18,883,884],{},"减少环境噪音","：在安静环境录制，或使用降噪麦克风。",[40,887,888,891],{},[18,889,890],{},"先文字测试","：先用文字测试提示词，再用语音，避免重复录制。",[40,893,894,897],{},[18,895,896],{},"保存工作流","：将可用的语音工作流保存，方便重复使用。",[40,899,900,903],{},[18,901,902],{},"定期更新","：语音插件与 Whisper 模型更新频繁，新版本会提升精度与速度。",[40,905,906,909],{},[18,907,908],{},"控制参数数量","：单次命令不要超过 2–3 个参数，识别更稳定。",[14,911,912],{},"更多资源：",[37,914,915,922,929],{},[40,916,917],{},[87,918,921],{"href":919,"rel":920},"https://github.com/Comfy-Org/ComfyUI-Voice-Input/blob/main/README.md",[91],"ComfyUI 语音输入插件文档",[40,923,924],{},[87,925,928],{"href":926,"rel":927},"https://github.com/openai/whisper",[91],"OpenAI Whisper 官方文档",[40,930,931],{},[87,932,935],{"href":933,"rel":934},"https://comfyui.workflows/",[91],"ComfyUI 高级工作流画廊",{"title":937,"searchDepth":938,"depth":938,"links":939},"",2,[940,941,945,950,956,962,969],{"id":28,"depth":938,"text":28},{"id":77,"depth":938,"text":77,"children":942},[943],{"id":109,"depth":944,"text":109},3,{"id":199,"depth":938,"text":200,"children":946},[947,948,949],{"id":203,"depth":944,"text":204},{"id":229,"depth":944,"text":230},{"id":286,"depth":944,"text":287},{"id":326,"depth":938,"text":327,"children":951},[952,953,954,955],{"id":333,"depth":944,"text":334},{"id":370,"depth":944,"text":371},{"id":441,"depth":944,"text":442},{"id":518,"depth":944,"text":519},{"id":568,"depth":938,"text":569,"children":957},[958,959,960,961],{"id":575,"depth":944,"text":576},{"id":600,"depth":944,"text":601},{"id":671,"depth":944,"text":672},{"id":741,"depth":944,"text":742},{"id":768,"depth":938,"text":769,"children":963},[964,965,966,967,968],{"id":772,"depth":944,"text":773},{"id":787,"depth":944,"text":788},{"id":805,"depth":944,"text":806},{"id":823,"depth":944,"text":824},{"id":838,"depth":944,"text":839},{"id":861,"depth":938,"text":862},"本篇分步教程将教你在 ComfyUI 中搭建语音控制 AI 图像生成工作流——从语音转文字（STT）集成，到可通过语音调节的各项参数。","md",{},true,"/zh/advanced-tutorial/audio-workflow",{"title":5,"description":970},"zh/advanced-tutorial/audio-workflow","B5Udk1Hy4zy6SxLJb-nJWq0pFstgAPsU-CxMGGZXtO0",1773986044745]