diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs index 5f12b9802..6d4193458 100644 --- a/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs +++ b/OpenUtau.Core/DiffSinger/DiffSingerVariance.cs @@ -35,6 +35,7 @@ public class DsVariance : IDisposable{ IG2p g2p; float frameMs; DiffSingerSpeakerEmbedManager speakerEmbedManager; + readonly Dictionary variancePatchStates = new Dictionary(); public float FrameMs => frameMs; @@ -195,34 +196,41 @@ public VarianceResult Process(RenderPhrase phrase){ pitch = pitch.Zip(toneShift, (x, d) => x + d).ToArray(); var varianceInputs = new List(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out)); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("ph_dur", + var variancePatchInputs = new List(); + void AddVarianceInput(NamedOnnxValue input, bool includeInPatchKey = true) { + varianceInputs.Add(input); + if (includeInPatchKey) { + variancePatchInputs.Add(input); + } + } + AddVarianceInput(NamedOnnxValue.CreateFromTensor("encoder_out", encoder_out)); + AddVarianceInput(NamedOnnxValue.CreateFromTensor("ph_dur", new DenseTensor(ph_dur.Select(x=>(Int64)x).ToArray(), new int[] { ph_dur.Length }, false) .Reshape(new int[] { 1, ph_dur.Length }))); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("pitch", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("pitch", new DenseTensor(pitch, new int[] { pitch.Length }, false) - .Reshape(new int[] { 1, totalFrames }))); + .Reshape(new int[] { 1, totalFrames })), includeInPatchKey: false); if (dsConfig.predict_energy) { var energy = Enumerable.Repeat(0f, totalFrames).ToArray(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("energy", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("energy", new DenseTensor(energy, new int[] { energy.Length }, false) .Reshape(new int[] { 1, totalFrames }))); } if (dsConfig.predict_breathiness) { var breathiness = Enumerable.Repeat(0f, totalFrames).ToArray(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("breathiness", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("breathiness", new DenseTensor(breathiness, new int[] { breathiness.Length }, false) .Reshape(new int[] { 1, totalFrames }))); } if (dsConfig.predict_voicing) { var voicing = Enumerable.Repeat(0f, totalFrames).ToArray(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("voicing", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("voicing", new DenseTensor(voicing, new int[] { voicing.Length }, false) .Reshape(new int[] { 1, totalFrames }))); } if (dsConfig.predict_tension) { var tension = Enumerable.Repeat(0f, totalFrames).ToArray(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("tension", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("tension", new DenseTensor(tension, new int[] { tension.Length }, false) .Reshape(new int[] { 1, totalFrames }))); } @@ -234,12 +242,12 @@ public VarianceResult Process(RenderPhrase phrase){ dsConfig.predict_tension, }.Sum(Convert.ToInt32); var retake = Enumerable.Repeat(true, totalFrames * numVariances).ToArray(); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("retake", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("retake", new DenseTensor(retake, new int[] { retake.Length }, false) .Reshape(new int[] { 1, totalFrames, numVariances }))); var steps = Preferences.Default.DiffSingerStepsVariance; if (dsConfig.useContinuousAcceleration) { - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("steps", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("steps", new DenseTensor(new long[] { steps }, new int[] { 1 }, false))); } else { // find a largest integer speedup that are less than 1000 / steps and is a factor of 1000 @@ -247,14 +255,20 @@ public VarianceResult Process(RenderPhrase phrase){ while (1000 % speedup != 0 && speedup > 1) { speedup--; } - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("speedup", + AddVarianceInput(NamedOnnxValue.CreateFromTensor("speedup", new DenseTensor(new long[] { speedup }, new int[] { 1 },false))); } //Speaker if(dsConfig.speakers != null) { var speakerEmbedManager = getSpeakerEmbedManager(); var spkEmbedTensor = speakerEmbedManager.PhraseSpeakerEmbedByFrame(phrase, ph_dur, frameMs, totalFrames, headFrames, tailFrames); - varianceInputs.Add(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor)); + AddVarianceInput(NamedOnnxValue.CreateFromTensor("spk_embed", spkEmbedTensor)); + } + ulong? variancePatchKey = null; + if (Preferences.Default.DiffSingerTensorCache && + Preferences.Default.DiffSingerVarianceLocalPitchPatch) { + var baseHash = new DiffSingerCache(varianceHash, variancePatchInputs).Hash; + variancePatchKey = DiffSingerVariancePatch.BuildStateKey(baseHash, phrase.position, phrase.end); } Onnx.VerifyInputNames(varianceModel, varianceInputs); var varianceCache = Preferences.Default.DiffSingerTensorCache @@ -290,7 +304,7 @@ public VarianceResult Process(RenderPhrase phrase){ .First() .AsTensor() : null; - return new VarianceResult{ + var result = new VarianceResult{ energy = energy_pred?.ToArray(), breathiness = breathiness_pred?.ToArray(), voicing = voicing_pred?.ToArray(), @@ -300,6 +314,23 @@ public VarianceResult Process(RenderPhrase phrase){ tailFrames = tailFrames, totalFrames = totalFrames, }; + if (variancePatchKey.HasValue) { + result = ApplyVariancePatch(variancePatchKey.Value, pitch, result); + } + return result; + } + + VarianceResult ApplyVariancePatch(ulong patchKey, float[] pitch, VarianceResult result) { + try { + variancePatchStates.TryGetValue(patchKey, out var previous); + var merged = DiffSingerVariancePatch.Merge(previous, pitch, result); + variancePatchStates[patchKey] = new VariancePatchState(pitch, merged); + return merged; + } catch (Exception e) { + Log.Warning(e, "Failed to apply DiffSinger variance local pitch patch."); + variancePatchStates[patchKey] = new VariancePatchState(pitch, result); + return result; + } } private bool disposedValue; diff --git a/OpenUtau.Core/DiffSinger/DiffSingerVariancePatch.cs b/OpenUtau.Core/DiffSinger/DiffSingerVariancePatch.cs new file mode 100644 index 000000000..bd74de2de --- /dev/null +++ b/OpenUtau.Core/DiffSinger/DiffSingerVariancePatch.cs @@ -0,0 +1,143 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace OpenUtau.Core.DiffSinger { + internal readonly struct VariancePatchRange { + public readonly int start; + public readonly int end; + + public VariancePatchRange(int start, int end) { + this.start = start; + this.end = end; + } + } + + internal class VariancePatchState { + public readonly float[] pitch; + public readonly VarianceResult result; + + public VariancePatchState(float[] pitch, VarianceResult result) { + this.pitch = pitch.ToArray(); + this.result = DiffSingerVariancePatch.CloneResult(result); + } + } + + internal static class DiffSingerVariancePatch { + const float PitchEpsilon = 1e-4f; + const float CrossfadeMs = 50f; + + public static ulong BuildStateKey(ulong baseHash, int phrasePosition, int phraseEnd) { + unchecked { + ulong hash = baseHash; + hash = (hash ^ (uint)phrasePosition) * 1099511628211UL; + hash = (hash ^ (uint)phraseEnd) * 1099511628211UL; + return hash; + } + } + + public static VarianceResult Merge( + VariancePatchState? previous, + float[] currentPitch, + VarianceResult current) { + if (previous == null || + previous.pitch.Length != currentPitch.Length || + !IsMetadataCompatible(previous.result, current)) { + return CloneResult(current); + } + var ranges = FindChangedRanges(previous.pitch, currentPitch, PitchEpsilon); + if (ranges.Count == 0) { + return CloneResult(previous.result); + } + int crossfadeFrames = Math.Clamp((int)Math.Round(CrossfadeMs / current.frameMs), 1, 20); + var weights = BuildWeights(currentPitch.Length, ranges, crossfadeFrames); + return new VarianceResult { + energy = Blend(previous.result.energy, current.energy, weights), + breathiness = Blend(previous.result.breathiness, current.breathiness, weights), + voicing = Blend(previous.result.voicing, current.voicing, weights), + tension = Blend(previous.result.tension, current.tension, weights), + frameMs = current.frameMs, + headFrames = current.headFrames, + tailFrames = current.tailFrames, + totalFrames = current.totalFrames, + }; + } + + internal static List FindChangedRanges( + IReadOnlyList previousPitch, + IReadOnlyList currentPitch, + float epsilon) { + var ranges = new List(); + int length = Math.Min(previousPitch.Count, currentPitch.Count); + int start = -1; + for (int i = 0; i < length; ++i) { + bool changed = Math.Abs(previousPitch[i] - currentPitch[i]) > epsilon; + if (changed && start < 0) { + start = i; + } else if (!changed && start >= 0) { + ranges.Add(new VariancePatchRange(start, i)); + start = -1; + } + } + if (start >= 0) { + ranges.Add(new VariancePatchRange(start, length)); + } + return ranges; + } + + internal static float[] BuildWeights(int length, IReadOnlyList ranges, int crossfadeFrames) { + var weights = new float[length]; + foreach (var range in ranges) { + int start = Math.Clamp(range.start, 0, length); + int end = Math.Clamp(range.end, start, length); + for (int i = start; i < end; ++i) { + weights[i] = 1f; + } + int leftStart = Math.Max(0, start - crossfadeFrames); + int leftLength = start - leftStart; + for (int i = leftStart; i < start; ++i) { + float weight = (float)(i - leftStart + 1) / (leftLength + 1); + weights[i] = Math.Max(weights[i], weight); + } + int rightEnd = Math.Min(length, end + crossfadeFrames); + int rightLength = rightEnd - end; + for (int i = end; i < rightEnd; ++i) { + float weight = 1f - (float)(i - end + 1) / (rightLength + 1); + weights[i] = Math.Max(weights[i], weight); + } + } + return weights; + } + + internal static float[]? Blend(float[]? previous, float[]? current, IReadOnlyList weights) { + if (previous == null || current == null || previous.Length != current.Length || previous.Length != weights.Count) { + return current?.ToArray(); + } + var result = new float[current.Length]; + for (int i = 0; i < result.Length; ++i) { + result[i] = previous[i] * (1f - weights[i]) + current[i] * weights[i]; + } + return result; + } + + internal static VarianceResult CloneResult(VarianceResult result) { + return new VarianceResult { + energy = result.energy?.ToArray(), + breathiness = result.breathiness?.ToArray(), + voicing = result.voicing?.ToArray(), + tension = result.tension?.ToArray(), + frameMs = result.frameMs, + headFrames = result.headFrames, + tailFrames = result.tailFrames, + totalFrames = result.totalFrames, + }; + } + + static bool IsMetadataCompatible(VarianceResult previous, VarianceResult current) { + return previous.totalFrames == current.totalFrames && + previous.headFrames == current.headFrames && + previous.tailFrames == current.tailFrames && + Math.Abs(previous.frameMs - current.frameMs) < 1e-4f; + } + } +} diff --git a/OpenUtau.Core/Properties/AssemblyInfo.cs b/OpenUtau.Core/Properties/AssemblyInfo.cs new file mode 100644 index 000000000..aa105cb38 --- /dev/null +++ b/OpenUtau.Core/Properties/AssemblyInfo.cs @@ -0,0 +1,3 @@ +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("OpenUtau.Test")] diff --git a/OpenUtau.Core/Util/Preferences.cs b/OpenUtau.Core/Util/Preferences.cs index 132412639..8029c7048 100644 --- a/OpenUtau.Core/Util/Preferences.cs +++ b/OpenUtau.Core/Util/Preferences.cs @@ -165,6 +165,7 @@ public class SerializablePreferences { public int DiffSingerStepsVariance = 20; public int DiffSingerStepsPitch = 10; public bool DiffSingerTensorCache = true; + public bool DiffSingerVarianceLocalPitchPatch = false; public bool DiffSingerLangCodeHide = false; public bool SkipRenderingMutedTracks = false; public string Language = string.Empty; diff --git a/OpenUtau.Test/Core/DiffSinger/DiffSingerVariancePatchTest.cs b/OpenUtau.Test/Core/DiffSinger/DiffSingerVariancePatchTest.cs new file mode 100644 index 000000000..7b05b0b21 --- /dev/null +++ b/OpenUtau.Test/Core/DiffSinger/DiffSingerVariancePatchTest.cs @@ -0,0 +1,69 @@ +using System.Linq; +using OpenUtau.Core.DiffSinger; +using Xunit; + +namespace OpenUtau.Core { + public class DiffSingerVariancePatchTest { + [Fact] + public void FindChangedRangesGroupsContiguousPitchChanges() { + var previous = new[] { 1f, 1f, 1f, 1f, 1f, 1f }; + var current = new[] { 1f, 2f, 2f, 1f, 2f, 1f }; + + var ranges = DiffSingerVariancePatch.FindChangedRanges(previous, current, 1e-4f); + + Assert.Equal(2, ranges.Count); + Assert.Equal(1, ranges[0].start); + Assert.Equal(3, ranges[0].end); + Assert.Equal(4, ranges[1].start); + Assert.Equal(5, ranges[1].end); + } + + [Fact] + public void MergeKeepsPreviousResultWhenPitchDoesNotChange() { + var previousResult = Result(new[] { 1f, 2f, 3f }); + var currentResult = Result(new[] { 10f, 20f, 30f }); + var previous = new VariancePatchState(new[] { 60f, 61f, 62f }, previousResult); + + var merged = DiffSingerVariancePatch.Merge(previous, new[] { 60f, 61f, 62f }, currentResult); + + Assert.Equal(previousResult.energy!, merged.energy!); + } + + [Fact] + public void MergeBlendsOnlyChangedPitchRange() { + var previousResult = Result(Enumerable.Repeat(0f, 6).ToArray(), frameMs: 50); + var currentResult = Result(Enumerable.Repeat(10f, 6).ToArray(), frameMs: 50); + var previous = new VariancePatchState( + new[] { 60f, 60f, 60f, 60f, 60f, 60f }, + previousResult); + + var merged = DiffSingerVariancePatch.Merge( + previous, + new[] { 60f, 60f, 61f, 61f, 60f, 60f }, + currentResult); + + Assert.Equal(new[] { 0f, 5f, 10f, 10f, 5f, 0f }, merged.energy!); + } + + [Fact] + public void MergeFallsBackToCurrentResultWhenMetadataChanges() { + var previousResult = Result(new[] { 1f, 2f, 3f }, frameMs: 50); + var currentResult = Result(new[] { 10f, 20f, 30f }, frameMs: 60); + var previous = new VariancePatchState(new[] { 60f, 61f, 62f }, previousResult); + + var merged = DiffSingerVariancePatch.Merge(previous, new[] { 60f, 62f, 62f }, currentResult); + + Assert.Equal(currentResult.energy!, merged.energy!); + } + + static VarianceResult Result(float[] energy, float frameMs = 50) { + return new VarianceResult { + energy = energy, + frameMs = frameMs, + headFrames = 1, + tailFrames = 1, + totalFrames = energy.Length, + }; + } + } +} diff --git a/OpenUtau/Strings/Strings.axaml b/OpenUtau/Strings/Strings.axaml index 4aa620c1e..eae594d7d 100644 --- a/OpenUtau/Strings/Strings.axaml +++ b/OpenUtau/Strings/Strings.axaml @@ -627,6 +627,7 @@ Warning: this option removes custom presets. DiffSinger Render Steps for Acoustic DiffSinger Render Steps for Pitch DiffSinger Render Steps for Variance + DiffSinger Local Variance Update for Pitch Edits GPU Machine Learning Runner Phase Compensation diff --git a/OpenUtau/ViewModels/PreferencesViewModel.cs b/OpenUtau/ViewModels/PreferencesViewModel.cs index 9db829b46..341e18782 100644 --- a/OpenUtau/ViewModels/PreferencesViewModel.cs +++ b/OpenUtau/ViewModels/PreferencesViewModel.cs @@ -120,6 +120,7 @@ public int SafeMaxThreadCount { [Reactive] public int DiffSingerStepsPitch { get; set; } [Reactive] public double DiffSingerDepth { get; set; } [Reactive] public bool DiffSingerTensorCache { get; set; } + [Reactive] public bool DiffSingerVarianceLocalPitchPatch { get; set; } [Reactive] public bool DiffSingerLangCodeHide { get; set; } // Advanced @@ -174,6 +175,7 @@ public PreferencesViewModel() { DiffSingerStepsVariance = Preferences.Default.DiffSingerStepsVariance; DiffSingerStepsPitch = Preferences.Default.DiffSingerStepsPitch; DiffSingerTensorCache = Preferences.Default.DiffSingerTensorCache; + DiffSingerVarianceLocalPitchPatch = Preferences.Default.DiffSingerVarianceLocalPitchPatch; DiffSingerLangCodeHide = Preferences.Default.DiffSingerLangCodeHide; SkipRenderingMutedTracks = Preferences.Default.SkipRenderingMutedTracks; ThemeName = Preferences.Default.ThemeName; @@ -393,6 +395,11 @@ public PreferencesViewModel() { Preferences.Default.DiffSingerTensorCache = useCache; Preferences.Save(); }); + this.WhenAnyValue(vm => vm.DiffSingerVarianceLocalPitchPatch) + .Subscribe(useLocalPatch => { + Preferences.Default.DiffSingerVarianceLocalPitchPatch = useLocalPatch; + Preferences.Save(); + }); this.WhenAnyValue(vm => vm.DiffSingerLangCodeHide) .Subscribe(useCache => { Preferences.Default.DiffSingerLangCodeHide = useCache; diff --git a/OpenUtau/Views/PreferencesDialog.axaml b/OpenUtau/Views/PreferencesDialog.axaml index e116bbe34..fd0e01234 100644 --- a/OpenUtau/Views/PreferencesDialog.axaml +++ b/OpenUtau/Views/PreferencesDialog.axaml @@ -325,6 +325,10 @@ + + + +