Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 44 additions & 13 deletions OpenUtau.Core/DiffSinger/DiffSingerVariance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class DsVariance : IDisposable{
IG2p g2p;
float frameMs;
DiffSingerSpeakerEmbedManager speakerEmbedManager;
readonly Dictionary<ulong, VariancePatchState> variancePatchStates = new Dictionary<ulong, VariancePatchState>();

public float FrameMs => frameMs;

Expand Down Expand Up @@ -195,34 +196,41 @@ public VarianceResult Process(RenderPhrase phrase){
pitch = pitch.Zip(toneShift, (x, d) => x + d).ToArray();

var varianceInputs = new List<NamedOnnxValue>();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out));
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur",
var variancePatchInputs = new List<NamedOnnxValue>();
void AddVarianceInput(NamedOnnxValue input, bool includeInPatchKey = true) {
varianceInputs.Add(input);
if (includeInPatchKey) {
variancePatchInputs.Add(input);
}
}
AddVarianceInput(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out));
AddVarianceInput(NamedOnnxValue.CreateFromTensor("ph_dur",
new DenseTensor<Int64>(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false)
.Reshape(new int[] { 1, ph_dur.Length })));
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("pitch",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("pitch",
new DenseTensor<float>(pitch, new int[] { pitch.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
.Reshape(new int[] { 1, totalFrames })), includeInPatchKey: false);
if (dsConfig.predict_energy) {
var energy = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("energy",
new DenseTensor<float>(energy, new int[] { energy.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_breathiness) {
var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("breathiness",
new DenseTensor<float>(breathiness, new int[] { breathiness.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_voicing) {
var voicing = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("voicing",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("voicing",
new DenseTensor<float>(voicing, new int[] { voicing.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
if (dsConfig.predict_tension) {
var tension = Enumerable.Repeat(0f, totalFrames).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("tension",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("tension",
new DenseTensor<float>(tension, new int[] { tension.Length }, false)
.Reshape(new int[] { 1, totalFrames })));
}
Expand All @@ -234,27 +242,33 @@ public VarianceResult Process(RenderPhrase phrase){
dsConfig.predict_tension,
}.Sum(Convert.ToInt32);
var retake = Enumerable.Repeat(true, totalFrames * numVariances).ToArray();
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("retake",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("retake",
new DenseTensor<bool>(retake, new int[] { retake.Length }, false)
.Reshape(new int[] { 1, totalFrames, numVariances })));
var steps = Preferences.Default.DiffSingerStepsVariance;
if (dsConfig.useContinuousAcceleration) {
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("steps",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("steps",
new DenseTensor<long>(new long[] { steps }, new int[] { 1 }, false)));
} else {
// Find the largest integer speedup that does not exceed 1000 / steps and is a factor of 1000.
long speedup = Math.Max(1, 1000 / steps);
while (1000 % speedup != 0 && speedup > 1) {
speedup--;
}
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("speedup",
AddVarianceInput(NamedOnnxValue.CreateFromTensor("speedup",
new DenseTensor<long>(new long[] { speedup }, new int[] { 1 },false)));
}
//Speaker
if(dsConfig.speakers != null) {
var speakerEmbedManager = getSpeakerEmbedManager();
var spkEmbedTensor = speakerEmbedManager.PhraseSpeakerEmbedByFrame(phrase, ph_dur, frameMs, totalFrames, headFrames, tailFrames);
varianceInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor));
AddVarianceInput(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor));
}
ulong? variancePatchKey = null;
if (Preferences.Default.DiffSingerTensorCache &&
Preferences.Default.DiffSingerVarianceLocalPitchPatch) {
var baseHash = new DiffSingerCache(varianceHash, variancePatchInputs).Hash;
variancePatchKey = DiffSingerVariancePatch.BuildStateKey(baseHash, phrase.position, phrase.end);
}
Onnx.VerifyInputNames(varianceModel, varianceInputs);
var varianceCache = Preferences.Default.DiffSingerTensorCache
Expand Down Expand Up @@ -290,7 +304,7 @@ public VarianceResult Process(RenderPhrase phrase){
.First()
.AsTensor<float>()
: null;
return new VarianceResult{
var result = new VarianceResult{
energy = energy_pred?.ToArray(),
breathiness = breathiness_pred?.ToArray(),
voicing = voicing_pred?.ToArray(),
Expand All @@ -300,6 +314,23 @@ public VarianceResult Process(RenderPhrase phrase){
tailFrames = tailFrames,
totalFrames = totalFrames,
};
if (variancePatchKey.HasValue) {
result = ApplyVariancePatch(variancePatchKey.Value, pitch, result);
}
return result;
}

/// <summary>
/// Merges a freshly predicted variance result with the state remembered under
/// <paramref name="patchKey"/>, then remembers the outcome for the next call.
/// </summary>
/// <param name="patchKey">Key into <c>variancePatchStates</c> identifying the phrase state.</param>
/// <param name="pitch">Pitch curve that was fed to the variance model for this prediction.</param>
/// <param name="result">Variance result produced by the current prediction.</param>
/// <returns>
/// The merged result, or <paramref name="result"/> itself when merging (or storing the
/// merged state) throws.
/// </returns>
VarianceResult ApplyVariancePatch(ulong patchKey, float[] pitch, VarianceResult result) {
    try {
        // A missing key leaves `remembered` at its default; Merge is handed that value as-is.
        variancePatchStates.TryGetValue(patchKey, out var remembered);
        var patched = DiffSingerVariancePatch.Merge(remembered, pitch, result);
        variancePatchStates[patchKey] = new VariancePatchState(pitch, patched);
        return patched;
    } catch (Exception error) {
        Log.Warning(error, "Failed to apply DiffSinger variance local pitch patch.");
    }
    // Best-effort fallback: remember the unpatched prediction so the next call starts from it.
    // NOTE(review): this method only ever adds or overwrites entries; confirm that
    // variancePatchStates is pruned or cleared somewhere else.
    variancePatchStates[patchKey] = new VariancePatchState(pitch, result);
    return result;
}

private bool disposedValue;
Expand Down
143 changes: 143 additions & 0 deletions OpenUtau.Core/DiffSinger/DiffSingerVariancePatch.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
using System;
using System.Collections.Generic;
using System.Linq;

namespace OpenUtau.Core.DiffSinger {
/// <summary>
/// Immutable pair of frame indices describing one contiguous run of frames.
/// Producers and consumers in this file treat <see cref="start"/> as inclusive and
/// <see cref="end"/> as exclusive.
/// </summary>
internal readonly struct VariancePatchRange {
    /// <summary>Index of the first frame in the run.</summary>
    public readonly int start;

    /// <summary>Index one past the last frame in the run.</summary>
    public readonly int end;

    /// <summary>Creates a range from the given boundaries; no validation is performed.</summary>
    public VariancePatchRange(int start, int end) => (this.start, this.end) = (start, end);
}

/// <summary>
/// Snapshot of a pitch curve together with the variance result associated with it.
/// Both are copied on construction, so later changes to the caller's arrays do not
/// affect the snapshot.
/// </summary>
internal class VariancePatchState {
    /// <summary>Private copy of the pitch curve passed to the constructor.</summary>
    public readonly float[] pitch;

    /// <summary>Copy of the variance result, made with <c>DiffSingerVariancePatch.CloneResult</c>.</summary>
    public readonly VarianceResult result;

    /// <summary>Captures copies of <paramref name="pitch"/> and <paramref name="result"/>.</summary>
    public VariancePatchState(float[] pitch, VarianceResult result) =>
        (this.pitch, this.result) = (pitch.ToArray(), DiffSingerVariancePatch.CloneResult(result));
}

internal static class DiffSingerVariancePatch {
const float PitchEpsilon = 1e-4f;
const float CrossfadeMs = 50f;

/// <summary>
/// Derives a patch-state key by folding the phrase boundaries into <paramref name="baseHash"/>.
/// Each boundary is reinterpreted as an unsigned 32-bit value, xor-ed into the running hash,
/// and the hash is then multiplied by the 64-bit FNV prime (wrapping on overflow).
/// </summary>
/// <param name="baseHash">Hash to start from.</param>
/// <param name="phrasePosition">Phrase start position, mixed in first.</param>
/// <param name="phraseEnd">Phrase end position, mixed in second.</param>
/// <returns>The combined 64-bit key.</returns>
public static ulong BuildStateKey(ulong baseHash, int phrasePosition, int phraseEnd) {
    const ulong Prime = 1099511628211UL;
    unchecked {
        ulong afterPosition = (baseHash ^ (uint)phrasePosition) * Prime;
        return (afterPosition ^ (uint)phraseEnd) * Prime;
    }
}

/// <summary>
/// Combines a newly predicted variance result with a previously remembered one so that
/// only frames whose pitch changed (plus a short crossfade around them) take new values.
/// </summary>
/// <param name="previous">Remembered state, or <c>null</c> when there is none.</param>
/// <param name="currentPitch">Pitch curve used for the current prediction.</param>
/// <param name="current">Variance result of the current prediction.</param>
/// <returns>
/// A copy of <paramref name="current"/> when there is no usable previous state (missing,
/// different pitch length, or incompatible metadata); a copy of the previous result when
/// no pitch frame changed by more than <c>PitchEpsilon</c>; otherwise a per-frame blend
/// carrying the metadata of <paramref name="current"/>.
/// </returns>
public static VarianceResult Merge(
    VariancePatchState? previous,
    float[] currentPitch,
    VarianceResult current) {
    // Guard clauses: anything that makes the old state unusable means "take the new result".
    if (previous == null) {
        return CloneResult(current);
    }
    if (previous.pitch.Length != currentPitch.Length) {
        return CloneResult(current);
    }
    if (!IsMetadataCompatible(previous.result, current)) {
        return CloneResult(current);
    }

    var changedRanges = FindChangedRanges(previous.pitch, currentPitch, PitchEpsilon);
    if (changedRanges.Count == 0) {
        // Pitch is unchanged, so the remembered curves are kept verbatim.
        return CloneResult(previous.result);
    }

    // Convert the crossfade duration to frames, bounded to 1..20 frames.
    var idealFadeFrames = Math.Round(CrossfadeMs / current.frameMs);
    int fadeFrames = Math.Clamp((int)idealFadeFrames, 1, 20);
    var mix = BuildWeights(currentPitch.Length, changedRanges, fadeFrames);

    var old = previous.result;
    var blendedEnergy = Blend(old.energy, current.energy, mix);
    var blendedBreathiness = Blend(old.breathiness, current.breathiness, mix);
    var blendedVoicing = Blend(old.voicing, current.voicing, mix);
    var blendedTension = Blend(old.tension, current.tension, mix);
    return new VarianceResult {
        energy = blendedEnergy,
        breathiness = blendedBreathiness,
        voicing = blendedVoicing,
        tension = blendedTension,
        frameMs = current.frameMs,
        headFrames = current.headFrames,
        tailFrames = current.tailFrames,
        totalFrames = current.totalFrames,
    };
}

/// <summary>
/// Scans two pitch curves frame by frame and groups the frames that differ into
/// contiguous ranges.
/// </summary>
/// <param name="previousPitch">Earlier pitch curve.</param>
/// <param name="currentPitch">Newer pitch curve.</param>
/// <param name="epsilon">A frame counts as changed when the absolute difference exceeds this.</param>
/// <returns>
/// Ranges in ascending order, each with an inclusive start and an exclusive end. Only the
/// first <c>min(previousPitch.Count, currentPitch.Count)</c> frames are compared.
/// </returns>
internal static List<VariancePatchRange> FindChangedRanges(
    IReadOnlyList<float> previousPitch,
    IReadOnlyList<float> currentPitch,
    float epsilon) {
    var found = new List<VariancePatchRange>();
    int frameCount = Math.Min(previousPitch.Count, currentPitch.Count);

    bool Differs(int frame) => Math.Abs(previousPitch[frame] - currentPitch[frame]) > epsilon;

    int cursor = 0;
    while (cursor < frameCount) {
        if (!Differs(cursor)) {
            ++cursor;
            continue;
        }
        // Start of a changed run: advance until the first unchanged frame or the end.
        int runStart = cursor;
        do {
            ++cursor;
        } while (cursor < frameCount && Differs(cursor));
        found.Add(new VariancePatchRange(runStart, cursor));
    }
    return found;
}

/// <summary>
/// Builds a per-frame blend weight array: 1 inside every range, a linear ramp of up to
/// <paramref name="crossfadeFrames"/> frames on each side of a range, and 0 elsewhere.
/// Where ramps or ranges overlap, the larger weight wins.
/// </summary>
/// <param name="length">Number of frames in the resulting array.</param>
/// <param name="ranges">Ranges (inclusive start, exclusive end); clamped to the array bounds.</param>
/// <param name="crossfadeFrames">Maximum ramp length on each side, in frames.</param>
/// <returns>A new array of <paramref name="length"/> weights.</returns>
internal static float[] BuildWeights(int length, IReadOnlyList<VariancePatchRange> ranges, int crossfadeFrames) {
    var weights = new float[length];
    foreach (var changed in ranges) {
        int coreStart = Math.Clamp(changed.start, 0, length);
        int coreEnd = Math.Clamp(changed.end, coreStart, length);
        for (int frame = coreStart; frame < coreEnd; ++frame) {
            weights[frame] = 1f;
        }

        // Rising ramp before the range; it is shorter when the range sits near frame 0.
        int fadeInLength = coreStart - Math.Max(0, coreStart - crossfadeFrames);
        for (int step = 1; step <= fadeInLength; ++step) {
            int frame = coreStart - fadeInLength + step - 1;
            float ramp = (float)step / (fadeInLength + 1);
            weights[frame] = Math.Max(weights[frame], ramp);
        }

        // Falling ramp after the range; it is shorter when the range sits near the last frame.
        int fadeOutLength = Math.Min(length, coreEnd + crossfadeFrames) - coreEnd;
        for (int step = 1; step <= fadeOutLength; ++step) {
            int frame = coreEnd + step - 1;
            float ramp = 1f - (float)step / (fadeOutLength + 1);
            weights[frame] = Math.Max(weights[frame], ramp);
        }
    }
    return weights;
}

/// <summary>
/// Linearly interpolates between two curves, frame by frame:
/// <c>previous * (1 - weight) + current * weight</c>.
/// </summary>
/// <param name="previous">Earlier curve, or <c>null</c>.</param>
/// <param name="current">Newer curve, or <c>null</c>.</param>
/// <param name="weights">Per-frame weight given to <paramref name="current"/>.</param>
/// <returns>
/// A new blended array. When either curve is <c>null</c> or the three lengths do not all
/// match, a copy of <paramref name="current"/> is returned instead (<c>null</c> if it is <c>null</c>).
/// </returns>
internal static float[]? Blend(float[]? previous, float[]? current, IReadOnlyList<float> weights) {
    bool blendable = previous != null
        && current != null
        && previous.Length == current.Length
        && previous.Length == weights.Count;
    if (!blendable) {
        return current?.ToArray();
    }
    var blended = new float[current!.Length];
    for (int frame = 0; frame < blended.Length; ++frame) {
        float weight = weights[frame];
        blended[frame] = previous![frame] * (1f - weight) + current[frame] * weight;
    }
    return blended;
}

/// <summary>
/// Produces a copy of <paramref name="result"/> whose curve arrays are freshly allocated
/// (or <c>null</c> where the source array is <c>null</c>), so the copy shares no array
/// with the source. Scalar metadata is copied by value.
/// </summary>
internal static VarianceResult CloneResult(VarianceResult result) => new VarianceResult {
    // Scalar metadata.
    frameMs = result.frameMs,
    headFrames = result.headFrames,
    tailFrames = result.tailFrames,
    totalFrames = result.totalFrames,
    // Per-frame curves, duplicated so they can be changed independently of the source.
    energy = result.energy?.ToArray(),
    breathiness = result.breathiness?.ToArray(),
    voicing = result.voicing?.ToArray(),
    tension = result.tension?.ToArray(),
};

/// <summary>
/// Tells whether two results describe the same frame layout: identical total, head and
/// tail frame counts, and frame durations that differ by less than 1e-4.
/// </summary>
static bool IsMetadataCompatible(VarianceResult previous, VarianceResult current) {
    if (previous.totalFrames != current.totalFrames) {
        return false;
    }
    if (previous.headFrames != current.headFrames) {
        return false;
    }
    if (previous.tailFrames != current.tailFrames) {
        return false;
    }
    // Frame duration is compared with a tolerance rather than exact equality.
    return Math.Abs(previous.frameMs - current.frameMs) < 1e-4f;
}
}
}
3 changes: 3 additions & 0 deletions OpenUtau.Core/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;

[assembly: InternalsVisibleTo("OpenUtau.Test")]
1 change: 1 addition & 0 deletions OpenUtau.Core/Util/Preferences.cs
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ public class SerializablePreferences {
public int DiffSingerStepsVariance = 20;
public int DiffSingerStepsPitch = 10;
public bool DiffSingerTensorCache = true;
public bool DiffSingerVarianceLocalPitchPatch = false;
public bool DiffSingerLangCodeHide = false;
public bool SkipRenderingMutedTracks = false;
public string Language = string.Empty;
Expand Down
69 changes: 69 additions & 0 deletions OpenUtau.Test/Core/DiffSinger/DiffSingerVariancePatchTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using System.Linq;
using OpenUtau.Core.DiffSinger;
using Xunit;

namespace OpenUtau.Core {
/// <summary>
/// Unit tests for the change-detection and merge helpers of <see cref="DiffSingerVariancePatch"/>.
/// </summary>
public class DiffSingerVariancePatchTest {
    [Fact]
    public void FindChangedRangesGroupsContiguousPitchChanges() {
        float[] before = { 1f, 1f, 1f, 1f, 1f, 1f };
        float[] after = { 1f, 2f, 2f, 1f, 2f, 1f };

        var changed = DiffSingerVariancePatch.FindChangedRanges(before, after, 1e-4f);

        // Frames 1-2 and frame 4 differ; range ends are exclusive.
        Assert.Equal(
            new[] { (1, 3), (4, 5) },
            changed.Select(range => (range.start, range.end)).ToArray());
    }

    [Fact]
    public void MergeKeepsPreviousResultWhenPitchDoesNotChange() {
        float[] pitch = { 60f, 61f, 62f };
        var remembered = Result(new[] { 1f, 2f, 3f });
        var fresh = Result(new[] { 10f, 20f, 30f });
        var state = new VariancePatchState(pitch, remembered);

        var merged = DiffSingerVariancePatch.Merge(state, pitch.ToArray(), fresh);

        Assert.Equal(remembered.energy!, merged.energy!);
    }

    [Fact]
    public void MergeBlendsOnlyChangedPitchRange() {
        // With frameMs = 50 the crossfade is a single frame on each side of the change.
        var remembered = Result(new float[6], frameMs: 50);
        var fresh = Result(new[] { 10f, 10f, 10f, 10f, 10f, 10f }, frameMs: 50);
        float[] oldPitch = { 60f, 60f, 60f, 60f, 60f, 60f };
        float[] newPitch = { 60f, 60f, 61f, 61f, 60f, 60f };
        var state = new VariancePatchState(oldPitch, remembered);

        var merged = DiffSingerVariancePatch.Merge(state, newPitch, fresh);

        float[] expected = { 0f, 5f, 10f, 10f, 5f, 0f };
        Assert.Equal(expected, merged.energy!);
    }

    [Fact]
    public void MergeFallsBackToCurrentResultWhenMetadataChanges() {
        // The two results differ in frameMs, so the remembered state must be ignored.
        var remembered = Result(new[] { 1f, 2f, 3f }, frameMs: 50);
        var fresh = Result(new[] { 10f, 20f, 30f }, frameMs: 60);
        var state = new VariancePatchState(new[] { 60f, 61f, 62f }, remembered);
        float[] newPitch = { 60f, 62f, 62f };

        var merged = DiffSingerVariancePatch.Merge(state, newPitch, fresh);

        Assert.Equal(fresh.energy!, merged.energy!);
    }

    /// <summary>
    /// Builds a result holding only an energy curve, with one head frame, one tail frame,
    /// and a total frame count equal to the curve length.
    /// </summary>
    static VarianceResult Result(float[] energy, float frameMs = 50) => new VarianceResult {
        energy = energy,
        frameMs = frameMs,
        headFrames = 1,
        tailFrames = 1,
        totalFrames = energy.Length,
    };
}
}
1 change: 1 addition & 0 deletions OpenUtau/Strings/Strings.axaml
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ Warning: this option removes custom presets.</system:String>
<system:String x:Key="prefs.rendering.diffsingersteps">DiffSinger Render Steps for Acoustic</system:String>
<system:String x:Key="prefs.rendering.diffsingerstepspitch">DiffSinger Render Steps for Pitch</system:String>
<system:String x:Key="prefs.rendering.diffsingerstepsvariance">DiffSinger Render Steps for Variance</system:String>
<system:String x:Key="prefs.rendering.diffsingervariancelocalpitchpatch">DiffSinger Local Variance Update for Pitch Edits</system:String>
<system:String x:Key="prefs.rendering.onnxgpu">GPU</system:String>
<system:String x:Key="prefs.rendering.onnxrunner">Machine Learning Runner</system:String>
<system:String x:Key="prefs.rendering.phasecomp">Phase Compensation</system:String>
Expand Down
7 changes: 7 additions & 0 deletions OpenUtau/ViewModels/PreferencesViewModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ public int SafeMaxThreadCount {
[Reactive] public int DiffSingerStepsPitch { get; set; }
[Reactive] public double DiffSingerDepth { get; set; }
[Reactive] public bool DiffSingerTensorCache { get; set; }
[Reactive] public bool DiffSingerVarianceLocalPitchPatch { get; set; }
[Reactive] public bool DiffSingerLangCodeHide { get; set; }

// Advanced
Expand Down Expand Up @@ -174,6 +175,7 @@ public PreferencesViewModel() {
DiffSingerStepsVariance = Preferences.Default.DiffSingerStepsVariance;
DiffSingerStepsPitch = Preferences.Default.DiffSingerStepsPitch;
DiffSingerTensorCache = Preferences.Default.DiffSingerTensorCache;
DiffSingerVarianceLocalPitchPatch = Preferences.Default.DiffSingerVarianceLocalPitchPatch;
DiffSingerLangCodeHide = Preferences.Default.DiffSingerLangCodeHide;
SkipRenderingMutedTracks = Preferences.Default.SkipRenderingMutedTracks;
ThemeName = Preferences.Default.ThemeName;
Expand Down Expand Up @@ -393,6 +395,11 @@ public PreferencesViewModel() {
Preferences.Default.DiffSingerTensorCache = useCache;
Preferences.Save();
});
this.WhenAnyValue(vm => vm.DiffSingerVarianceLocalPitchPatch)
.Subscribe(useLocalPatch => {
Preferences.Default.DiffSingerVarianceLocalPitchPatch = useLocalPatch;
Preferences.Save();
});
this.WhenAnyValue(vm => vm.DiffSingerLangCodeHide)
.Subscribe(useCache => {
Preferences.Default.DiffSingerLangCodeHide = useCache;
Expand Down
Loading
Loading