使用C#压缩/解压缩字符串


144

我是.net的新手。我在C#中执行压缩和解压缩字符串。有一个XML,我要转换为字符串,然后进行压缩和解压缩。除了对代码进行解压缩并返回字符串(仅返回XML的一半)时,代码中没有编译错误。

以下是我的代码,请在错误之处纠正我。

代码:

class Program
{
    /// <summary>
    /// GZip-compresses the UTF-8 encoding of <paramref name="value"/>.
    /// </summary>
    /// <param name="value">Text to compress; must not be null.</param>
    /// <returns>
    /// The compressed bytes packed one byte per <see cref="char"/> (code units 0-255).
    /// The result is binary data carried in a string: it round-trips through
    /// <see cref="UnZip"/> in memory, but it is not safe to print or embed in XML.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="value"/> is null.</exception>
    public static string Zip(string value)
    {
        if (value == null)
        {
            throw new System.ArgumentNullException("value");
        }

        // A plain (byte) cast drops the high byte of every char above 255;
        // UTF-8 represents every valid string without loss.
        byte[] rawBytes = System.Text.Encoding.UTF8.GetBytes(value);

        using (System.IO.MemoryStream output = new System.IO.MemoryStream())
        {
            // The GZipStream must be disposed before the output is read, otherwise the
            // final compressed block and the gzip footer are still buffered inside it.
            using (System.IO.Compression.GZipStream gzip = new System.IO.Compression.GZipStream(
                output, System.IO.Compression.CompressionMode.Compress, true))
            {
                gzip.Write(rawBytes, 0, rawBytes.Length);
            }

            return BytesToBinaryString(output.ToArray());
        }
    }

    /// <summary>
    /// Reverses <see cref="Zip"/>: unpacks the one-byte-per-char string, GZip-decompresses it
    /// and decodes the result as UTF-8.
    /// </summary>
    /// <param name="value">A string previously produced by <see cref="Zip"/>; must not be null.</param>
    /// <returns>The original, uncompressed text.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="value"/> is null.</exception>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="value"/> contains a character above 255 and therefore cannot be packed bytes.
    /// </exception>
    public static string UnZip(string value)
    {
        if (value == null)
        {
            throw new System.ArgumentNullException("value");
        }

        byte[] compressed = BinaryStringToBytes(value);

        using (System.IO.MemoryStream input = new System.IO.MemoryStream(compressed))
        using (System.IO.Compression.GZipStream gzip = new System.IO.Compression.GZipStream(
            input, System.IO.Compression.CompressionMode.Decompress))
        using (System.IO.MemoryStream output = new System.IO.MemoryStream())
        {
            // The decompressed size is unknown (and normally larger than the compressed
            // size), and a single Read call may return only part of the data, so keep
            // reading until the stream reports end-of-data.
            byte[] chunk = new byte[4096];
            int read;
            while ((read = gzip.Read(chunk, 0, chunk.Length)) > 0)
            {
                output.Write(chunk, 0, read);
            }

            return System.Text.Encoding.UTF8.GetString(output.ToArray());
        }
    }

    // Packs each byte into one char (0-255) so binary data can travel in a string.
    private static string BytesToBinaryString(byte[] bytes)
    {
        System.Text.StringBuilder builder = new System.Text.StringBuilder(bytes.Length);
        foreach (byte b in bytes)
        {
            builder.Append((char)b);
        }
        return builder.ToString();
    }

    // Inverse of BytesToBinaryString; rejects chars that cannot have come from a single byte.
    private static byte[] BinaryStringToBytes(string packed)
    {
        byte[] bytes = new byte[packed.Length];
        for (int i = 0; i < packed.Length; i++)
        {
            char c = packed[i];
            if (c > 255)
            {
                throw new System.ArgumentException(
                    "The value is not a string produced by Zip (character above 255 found).", "value");
            }
            bytes[i] = (byte)c;
        }
        return bytes;
    }

    // Demo entry point: loads an XML file, compresses it and decompresses it again.
    static void Main(string[] args)
    {
        XDocument doc = XDocument.Load(@"D:\RSP.xml");
        string val = doc.ToString(SaveOptions.DisableFormatting);
        val = Zip(val);
        val = UnZip(val);
    }
}

我的XML大小为63KB。


1
我怀疑如果使用UTF8Encoding(或UTF16或诸如此类)和GetBytes / GetString ,问题将“自行修复” 。这也将大大简化代码。还建议使用using

您不能像您一样使用简单的转换将char转换为byte或相反。您需要使用一种编码,并且压缩/解压缩使用相同的编码。请参阅下面的xanatos答案。
西蒙·穆里尔

@pst不,不会;您将使用Encoding错误的方法。根据xanatos的答案,您需要在此处使用base-64
Marc Gravell

@Marc Gravell True,错过了签名/意图的那一部分。绝对不是我签名的第一选择。

Answers:


257

压缩/解压缩字符串的代码

/// <summary>
/// Pumps every remaining byte from <paramref name="src"/> into <paramref name="dest"/>
/// through a 4096-byte scratch buffer.
/// </summary>
/// <param name="src">Stream to read from until Read returns 0.</param>
/// <param name="dest">Stream that receives each chunk that was read.</param>
public static void CopyTo(Stream src, Stream dest) {
    byte[] chunk = new byte[4096];

    while (true) {
        int read = src.Read(chunk, 0, chunk.Length);
        if (read == 0) {
            // A zero-length read signals the end of the source stream.
            return;
        }

        dest.Write(chunk, 0, read);
    }
}

/// <summary>
/// Encodes <paramref name="str"/> as UTF-8 and GZip-compresses the bytes.
/// </summary>
/// <param name="str">Text to compress.</param>
/// <returns>The raw GZip-compressed bytes.</returns>
public static byte[] Zip(string str) {
    using (var compressed = new MemoryStream()) {
        using (var plain = new MemoryStream(Encoding.UTF8.GetBytes(str)))
        using (var gzip = new GZipStream(compressed, CompressionMode.Compress)) {
            CopyTo(plain, gzip);
        }

        // Read the result only after the GZipStream has been disposed above.
        return compressed.ToArray();
    }
}

/// <summary>
/// GZip-decompresses <paramref name="bytes"/> and decodes the result as UTF-8 text.
/// </summary>
/// <param name="bytes">GZip-compressed data.</param>
/// <returns>The decompressed text.</returns>
public static string Unzip(byte[] bytes) {
    using (var plain = new MemoryStream()) {
        using (var compressed = new MemoryStream(bytes))
        using (var gzip = new GZipStream(compressed, CompressionMode.Decompress)) {
            CopyTo(gzip, plain);
        }

        byte[] utf8 = plain.ToArray();
        return Encoding.UTF8.GetString(utf8);
    }
}

// Demo: compress a repetitive sample string, then decompress it again.
static void Main(string[] args) {
    const string sample = "StringStringStringStringStringStringStringStringStringStringStringStringStringString";
    byte[] packed = Zip(sample);
    string unpacked = Unzip(packed);
}

请记住,Zip 返回 byte[],而 Unzip 返回 string。如果您想从 Zip 得到一个字符串,可以对结果进行 Base64 编码(例如使用 Convert.ToBase64String(r1))。Zip 的结果是纯二进制数据,不能直接打印到屏幕上,也不能直接写入 XML。

建议的版本适用于.NET 2.0,适用于.NET 4.0使用MemoryStream.CopyTo

重要说明:GZipStream 只有在确认已获得全部输入之后(即,为了有效压缩,它需要所有数据),才能把压缩内容写入输出流。因此,在检查输出流(例如调用 mso.ToArray())之前,必须确保已对 GZipStream 调用 Dispose()。上面的代码通过 using() { } 块做到了这一点。请注意,GZipStream 位于最里层的 using 块中,而内容是在该块之外访问的。解压缩同样如此:在尝试访问数据之前,先对 GZipStream 调用 Dispose()。


谢谢您的答复。当我使用您的代码时,它给了我编译错误。“ CopyTo()没有名称空间或程序集引用。” 之后,我在Google上搜索并找到了.NET 4 Framework的CopyTo()部分。但是我正在研究.net 2.0和3.5框架。请建议我。:)
Mohit Kumar

我只想强调指出,必须在输出流上调用ToArray()之前处置GZipStream。我忽略了这一点,但这有所作为!
湿面

1
在.net 4.5中压缩是最有效的方法吗?
MonsterMMORPG 2014年

1
请注意,如果字符串包含代理对(例如 string s = "X\uD800Y"),此操作将失败(解压缩后的字符串 != 原始字符串)。我注意到,如果将 Encoding 改为 UTF7,它就能正常工作……但是使用 UTF7 能保证所有字符都可以表示吗?
digEmAll 2015年

@digEmAll 我认为,如果存在无效的代理对(如您的例子),它就无法正常工作。UTF8 的 GetBytes 转换会以 0xFFFD 静默替换无效的代理对。
xanatos 2015年

103

根据 此代码段, 我使用此代码,它工作正常:

using System;
using System.IO;
using System.IO.Compression;
using System.Text;

namespace CompressString
{
    /// <summary>
    /// Compresses strings to, and restores them from, a Base64 text format consisting of a
    /// 4-byte length prefix (the uncompressed UTF-8 byte count, in
    /// <see cref="BitConverter"/> byte order) followed by the GZip payload.
    /// </summary>
    internal static class StringCompressor
    {
        /// <summary>
        /// Compresses the string.
        /// </summary>
        /// <param name="text">The text to compress.</param>
        /// <returns>Base64 text holding the length prefix followed by the GZip data.</returns>
        public static string CompressString(string text)
        {
            byte[] buffer = Encoding.UTF8.GetBytes(text);

            byte[] compressedData;
            using (var memoryStream = new MemoryStream())
            {
                // leaveOpen: true keeps memoryStream usable after the GZipStream is disposed;
                // disposing the GZipStream is what flushes the final block and the footer.
                using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
                {
                    gZipStream.Write(buffer, 0, buffer.Length);
                }

                compressedData = memoryStream.ToArray();
            }

            var gZipBuffer = new byte[compressedData.Length + 4];
            Buffer.BlockCopy(BitConverter.GetBytes(buffer.Length), 0, gZipBuffer, 0, 4);
            Buffer.BlockCopy(compressedData, 0, gZipBuffer, 4, compressedData.Length);
            return Convert.ToBase64String(gZipBuffer);
        }

        /// <summary>
        /// Decompresses the string.
        /// </summary>
        /// <param name="compressedText">Text previously produced by <see cref="CompressString"/>.</param>
        /// <returns>The original text.</returns>
        /// <exception cref="ArgumentException">
        /// The decoded data is shorter than the 4-byte prefix, or the prefix is negative.
        /// </exception>
        /// <exception cref="InvalidDataException">
        /// The GZip payload ends before the number of bytes declared in the prefix was produced.
        /// </exception>
        public static string DecompressString(string compressedText)
        {
            byte[] gZipBuffer = Convert.FromBase64String(compressedText);
            if (gZipBuffer.Length < 4)
            {
                throw new ArgumentException("The compressed text is too short to contain a length prefix.", "compressedText");
            }

            int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
            if (dataLength < 0)
            {
                throw new ArgumentException("The compressed text declares a negative length.", "compressedText");
            }

            var buffer = new byte[dataLength];

            using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
            {
                // Stream.Read may return fewer bytes than requested, so a single call is not
                // guaranteed to fill the buffer; loop until every declared byte has arrived.
                int total = 0;
                while (total < buffer.Length)
                {
                    int read = gZipStream.Read(buffer, total, buffer.Length - total);
                    if (read == 0)
                    {
                        throw new InvalidDataException("The compressed data ended before the declared length was reached.");
                    }

                    total += read;
                }
            }

            return Encoding.UTF8.GetString(buffer);
        }
    }
}

2
我只想感谢您发布此代码。我将其放入我的项目中,它开箱即用,没有任何问题。
BoltBait

3
是的,开箱即用!我还喜欢将长度添加为前四个字节的想法
JustADev

2
这是最好的答案。这应该被标记为答案!
Eriawan Kusumawardhono

1
@Matt就像压缩一个.zip文件一样-.png已经是压缩的内容
fubo

2
标记为答案的答案不稳定。这是最好的答案。
莎丽

38

随着.NET 4.0(及更高版本)的Stream.CopyTo()方法的出现,我想我会发布一种更新的方法。

我也认为下面的版本作为独立类的清晰示例非常有用,该类用于将常规字符串压缩为Base64编码的字符串,反之亦然:

/// <summary>
/// Converts between plain strings and deflate-compressed, Base64-encoded strings.
/// </summary>
public static class StringCompression
{
    /// <summary>
    /// Compresses a string and returns a deflate compressed, Base64 encoded string.
    /// </summary>
    /// <param name="uncompressedString">String to compress</param>
    /// <returns>The Base64 text of the deflate-compressed UTF-8 bytes.</returns>
    public static string Compress(string uncompressedString)
    {
        byte[] compressedBytes;

        using (var uncompressedStream = new MemoryStream(Encoding.UTF8.GetBytes(uncompressedString)))
        using (var compressedStream = new MemoryStream())
        {
            // leaveOpen: true keeps compressedStream open when compressorStream is disposed.
            // Disposing compressorStream flushes its buffers into compressedStream, so
            // ToArray() below sees the complete compressed data on a stream that is still open.
            using (var compressorStream = new DeflateStream(compressedStream, CompressionLevel.Fastest, true))
            {
                uncompressedStream.CopyTo(compressorStream);
            }

            compressedBytes = compressedStream.ToArray();
        }

        return Convert.ToBase64String(compressedBytes);
    }

    /// <summary>
    /// Decompresses a deflate compressed, Base64 encoded string and returns an uncompressed string.
    /// </summary>
    /// <param name="compressedString">String to decompress.</param>
    /// <returns>The original text, decoded from the decompressed bytes as UTF-8.</returns>
    public static string Decompress(string compressedString)
    {
        byte[] decompressedBytes;

        // All three streams are owned by using statements so they are released even if
        // decompression throws part-way through.
        using (var compressedStream = new MemoryStream(Convert.FromBase64String(compressedString)))
        using (var decompressorStream = new DeflateStream(compressedStream, CompressionMode.Decompress))
        using (var decompressedStream = new MemoryStream())
        {
            decompressorStream.CopyTo(decompressedStream);

            decompressedBytes = decompressedStream.ToArray();
        }

        return Encoding.UTF8.GetString(decompressedBytes);
    }
}

这是使用扩展方法技术来扩展String类以添加字符串压缩和解压缩的另一种方法。您可以将下面的类放到现有项目中,然后使用:

// Round-trip a sample string through the Compress / Decompress extension methods.
const string uncompressedString = "Hello World!";
string compressedString = uncompressedString.Compress();

string decompressedString = compressedString.Decompress();

即:

/// <summary>
/// String extension methods for deflate compression with Base64 text encoding.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Compresses a string and returns a deflate compressed, Base64 encoded string.
    /// </summary>
    /// <param name="uncompressedString">String to compress</param>
    /// <returns>The Base64 text of the deflate-compressed UTF-8 bytes.</returns>
    public static string Compress(this string uncompressedString)
    {
        byte[] compressedBytes;

        using (var uncompressedStream = new MemoryStream(Encoding.UTF8.GetBytes(uncompressedString)))
        using (var compressedStream = new MemoryStream())
        {
            // leaveOpen: true keeps compressedStream open when compressorStream is disposed.
            // Disposing compressorStream flushes its buffers into compressedStream, so
            // ToArray() below sees the complete compressed data on a stream that is still open.
            using (var compressorStream = new DeflateStream(compressedStream, CompressionLevel.Fastest, true))
            {
                uncompressedStream.CopyTo(compressorStream);
            }

            compressedBytes = compressedStream.ToArray();
        }

        return Convert.ToBase64String(compressedBytes);
    }

    /// <summary>
    /// Decompresses a deflate compressed, Base64 encoded string and returns an uncompressed string.
    /// </summary>
    /// <param name="compressedString">String to decompress.</param>
    /// <returns>The original text, decoded from the decompressed bytes as UTF-8.</returns>
    public static string Decompress(this string compressedString)
    {
        byte[] decompressedBytes;

        // All three streams are owned by using statements so they are released even if
        // decompression throws part-way through.
        using (var compressedStream = new MemoryStream(Convert.FromBase64String(compressedString)))
        using (var decompressorStream = new DeflateStream(compressedStream, CompressionMode.Decompress))
        using (var decompressedStream = new MemoryStream())
        {
            decompressorStream.CopyTo(decompressedStream);

            decompressedBytes = decompressedStream.ToArray();
        }

        return Encoding.UTF8.GetString(decompressedBytes);
    }
}

2
Jace:我认为您遗漏了 MemoryStream 实例的 using 语句。另外提醒 F# 开发人员:不要对 compressorStream / decompressorStream 实例使用 use 关键字,因为在调用 ToArray() 之前需要手动释放它们
knocte

1
使用GZipStream是否会更好,因为它会添加一些额外的验证?是GZipStream还是DeflateStream类?
Michael Freidgeim '18

2
@Michael Freidgeim对于压缩和解压缩内存流,我不这么认为。对于文件或不可靠的传输,这是有道理的。我会说,在我的特定用例中,非常需要高速,因此我可以避免的任何开销都更好。
杰斯

固体。将我的20MB JSON字符串减小到4.5MB。🎉–
詹姆斯·艾什

1
效果很好,但您应在使用后处置内存流,或按照@knocte的建议使用每个流
Sebastian

8

这是.NET 4.5的更新版本,并且使用async / await和IEnumerables进行了更新:

/// <summary>
/// Extension methods that GZip-compress a serialized object and restore it again.
/// Serialization is delegated to the Serialize / Deserialize extension methods.
/// </summary>
public static class CompressionExtensions
{
    /// <summary>
    /// Serializes <paramref name="obj"/> and GZip-compresses the resulting bytes.
    /// </summary>
    /// <param name="obj">Object to serialize and compress.</param>
    /// <returns>A task producing the compressed bytes.</returns>
    public static async Task<IEnumerable<byte>> Zip(this object obj)
    {
        byte[] bytes = obj.Serialize();

        using (MemoryStream msi = new MemoryStream(bytes))
        using (MemoryStream mso = new MemoryStream())
        {
            // The GZipStream must be disposed before mso is read so that its buffered
            // tail has been written out.
            using (var gs = new GZipStream(mso, CompressionMode.Compress))
            {
                await msi.CopyToAsync(gs);
            }

            return mso.ToArray().AsEnumerable();
        }
    }

    /// <summary>
    /// GZip-decompresses <paramref name="bytes"/> and deserializes the result.
    /// </summary>
    /// <param name="bytes">Compressed data, as produced by <see cref="Zip"/>.</param>
    /// <returns>A task producing the deserialized object.</returns>
    public static async Task<object> Unzip(this byte[] bytes)
    {
        using (MemoryStream msi = new MemoryStream(bytes))
        using (MemoryStream mso = new MemoryStream())
        {
            using (var gs = new GZipStream(msi, CompressionMode.Decompress))
            {
                await gs.CopyToAsync(mso);
            }

            // Deserialize() returns a Task<object>. Without the await, that task itself is
            // implicitly converted to object and handed to the caller instead of the
            // deserialized value.
            return await mso.ToArray().Deserialize();
        }
    }
}

/// <summary>
/// BinaryFormatter-based helpers that turn objects into byte arrays and back.
/// </summary>
/// <remarks>
/// SECURITY NOTE(review): BinaryFormatter is unsafe for untrusted input and is obsolete or
/// disabled on current .NET versions. Only deserialize data this application produced
/// itself, and consider migrating to a safe serializer.
/// </remarks>
public static class SerializerExtensions
{
    /// <summary>
    /// Serializes <paramref name="objectToWrite"/> with <see cref="BinaryFormatter"/>.
    /// </summary>
    /// <param name="objectToWrite">Object to serialize.</param>
    /// <returns>Exactly the bytes written by the formatter.</returns>
    public static byte[] Serialize<T>(this T objectToWrite)
    {
        using (MemoryStream stream = new MemoryStream())
        {
            BinaryFormatter binaryFormatter = new BinaryFormatter();
            binaryFormatter.Serialize(stream, objectToWrite);

            // ToArray() returns only the bytes written. GetBuffer() exposes the whole
            // internal buffer, including unused capacity after the data.
            return stream.ToArray();
        }
    }

    /// <summary>
    /// Deserializes <paramref name="arr"/> with <see cref="BinaryFormatter"/> and casts the
    /// result to <typeparamref name="T"/>.
    /// </summary>
    /// <param name="arr">Bytes previously produced by <see cref="Serialize{T}"/>.</param>
    /// <returns>A task producing the deserialized value.</returns>
    public static async Task<T> _Deserialize<T>(this byte[] arr)
    {
        using (MemoryStream stream = new MemoryStream())
        {
            BinaryFormatter binaryFormatter = new BinaryFormatter();
            await stream.WriteAsync(arr, 0, arr.Length);

            // Rewind so the formatter reads from the start of the data just written.
            stream.Position = 0;

            return (T)binaryFormatter.Deserialize(stream);
        }
    }

    /// <summary>
    /// Deserializes <paramref name="arr"/> to an untyped object.
    /// </summary>
    /// <param name="arr">Bytes previously produced by <see cref="Serialize{T}"/>.</param>
    /// <returns>A task producing the deserialized object.</returns>
    public static async Task<object> Deserialize(this byte[] arr)
    {
        object obj = await arr._Deserialize<object>();
        return obj;
    }
}

有了这个你可以序列化一切 BinaryFormatter支持的内容,而不仅仅是字符串。

编辑:

如果需要照顾Encoding,可以只使用Convert.ToBase64String(byte []) ...

如果需要示例,请查看此答案!


在反序列化,编辑样本之前,您必须重置流的位置。另外,您的XML注释也不相关。
Magnus Johansson

值得注意的是,这仅适用于基于UTF8的东西。例如,如果将诸如åäö之类的瑞典语字符添加到要序列化/反序列化的字符串值中,它将无法通过双向测试:/
bc3tech

在这种情况下,您可以使用Convert.ToBase64String(byte[])。请参阅此答案(stackoverflow.com/a/23908465/3286975)。希望能帮助到你!
z3nth10n

6

对于仍然遇到“GZip 标头中的幻数不正确。请确保传入的是 GZip 流。”错误的人:如果您的字符串是使用 PHP 压缩的,则需要执行以下操作:

       /// <summary>
       /// Decodes a Base64 string holding zlib-format (RFC 1950) data, inflates it and
       /// returns the result as text read with <see cref="StreamReader"/>'s default encoding.
       /// </summary>
       /// <param name="originalReceivedSrc">Base64 text of the zlib-compressed payload.</param>
       /// <returns>The decompressed text.</returns>
       /// <exception cref="InvalidDataException">
       /// The decoded bytes do not start with a valid zlib header, or use a preset dictionary.
       /// </exception>
       public static string decodeDecompress(string originalReceivedSrc) {
           byte[] bytes = Convert.FromBase64String(originalReceivedSrc);

           // zlib data is a 2-byte header, a raw deflate stream and a 4-byte Adler-32 trailer.
           // Wrapping it in a fake gzip header fails whenever the gzip footer is validated,
           // because the trailer is not a gzip CRC-32/length footer. So validate the zlib
           // header and inflate the raw deflate stream directly instead.
           bool validZlibHeader =
               bytes.Length >= 2
               && (bytes[0] & 0x0F) == 8                      // compression method: deflate
               && ((bytes[0] << 8) | bytes[1]) % 31 == 0      // header check bits
               && (bytes[1] & 0x20) == 0;                     // no preset dictionary
           if (!validZlibHeader) {
               throw new InvalidDataException("The data does not start with a supported zlib header.");
           }

           // DeflateStream stops at the end of the deflate stream, so the trailing Adler-32
           // bytes are left unread.
           using (var mem = new MemoryStream(bytes, 2, bytes.Length - 2))
           using (var deflate = new DeflateStream(mem, CompressionMode.Decompress))
           using (var reader = new StreamReader(deflate)) {
               return reader.ReadToEnd();
           }
       }

我收到此异常:抛出异常:System.dll中的'System.IO.InvalidDataException'其他信息:GZip页脚中的CRC与从解压缩的数据计算出的CRC不匹配。
Dainius Kreivys'9
By using our site, you acknowledge that you have read and understand our Cookie Policy and Privacy Policy.
Licensed under cc by-sa 3.0 with attribution required.