Tus*_*ang 6 c# arrays json json.net
我有一个非常大的 JSON 文件,其中下面的 cars 数组最多可包含 100,000,000 条记录,文件总大小从 500 MB 到 10 GB 不等。我正在使用 Newtonsoft Json.NET。
输入
{
"name": "John",
"age": "30",
"cars": [{
"brand": "ABC",
"models": ["Alhambra", "Altea", "AlteaXL", "Arosa", "Cordoba", "CordobaVario", "Exeo", "Ibiza", "IbizaST", "ExeoST", "Leon", "LeonST", "Inca", "Mii", "Toledo"],
"year": "2019",
"month": "1",
"day": "1"
}, {
"brand": "XYZ",
"models": ["Alhambra", "Altea", "AlteaXL", "Arosa", "Cordoba", "CordobaVario", "Exeo", "Ibiza", "IbizaST", "ExeoST", "Leon", "LeonST", "Inca", "Mii", "Toledo"],
"year": "2019",
"month": "10",
"day": "01"
}],
"TestCity": "TestCityValue",
"TestCity1": "TestCityValue1"}
Run Code Online (Sandbox Code Playgroud)
期望的输出 文件1 Json
{
"name": "John",
"age": "30",
"cars": {
"brand": "ABC",
"models": ["Alhambra", "Altea", "AlteaXL", "Arosa", "Cordoba", "CordobaVario", "Exeo", "Ibiza", "IbizaST", "ExeoST", "Leon", "LeonST", "Inca", "Mii", "Toledo"],
"year": "2019",
"month": "1",
"day": "1"
},
"TestCity": "TestCityValue",
"TestCity1": "TestCityValue1"
}
Run Code Online (Sandbox Code Playgroud)
文件2 Json
{
"name": "John",
"age": "30",
"cars": {
"brand": "XYZ",
"models": ["Alhambra", "Altea", "AlteaXL", "Arosa", "Cordoba", "CordobaVario", "Exeo", "Ibiza", "IbizaST", "ExeoST", "Leon", "LeonST", "Inca", "Mii", "Toledo"],
"year": "2019",
"month": "10",
"day": "01"
},
"TestCity": "TestCityValue",
"TestCity1": "TestCityValue1"
}
Run Code Online (Sandbox Code Playgroud)
所以我想出了以下有用的代码
/// <summary>
/// Streams a JSON document from <paramref name="objUri"/> and, for every object found in the
/// root-level array named <paramref name="splitbyProperty"/>, writes one output file via
/// <c>insertIntoFileSystem</c>.
/// </summary>
/// <param name="objUri">Address of the JSON document to download and split.</param>
/// <param name="splitbyProperty">Name of the root-level array property whose items are split out.</param>
/// <remarks>
/// This is a single forward pass: each file is written as soon as its array item has been read,
/// so root properties that appear AFTER the array in the document have not been seen yet and
/// cannot be included in the output. Capturing them requires a second pass over the input.
/// </remarks>
public static void SplitJson(Uri objUri, string splitbyProperty)
{
    // NOTE(review): .Result blocks on the async download; kept because the method is
    // synchronous and callers may rely on the exception shape it produces.
    using (HttpClient client = new HttpClient())
    using (Stream stream = client.GetStreamAsync(objUri).Result)
    using (StreamReader streamReader = new StreamReader(stream))
    using (JsonTextReader reader = new JsonTextReader(streamReader))
    {
        Node objnode = new Node();
        while (reader.Read())
        {
            // Exact path comparison: only the root-level "name"/"age" properties qualify,
            // not any nested property whose path merely contains those letters.
            if (reader.TokenType == JsonToken.String
                && string.Equals(reader.Path, "name", StringComparison.Ordinal))
            {
                objnode.name = reader.Value.ToString();
            }

            // The sample input stores age as a JSON string ("30"), so accept both
            // string and integer tokens instead of integer only.
            if ((reader.TokenType == JsonToken.String || reader.TokenType == JsonToken.Integer)
                && string.Equals(reader.Path, "age", StringComparison.Ordinal))
            {
                objnode.age = reader.Value.ToString();
            }

            if (reader.TokenType == JsonToken.StartArray
                && string.Equals(reader.Path, splitbyProperty, StringComparison.Ordinal))
            {
                JsonSerializer serializer = JsonSerializer.Create();
                int counter = 0;
                while (reader.Read())
                {
                    if (reader.TokenType == JsonToken.StartObject)
                    {
                        counter++;

                        // Deserialize consumes the whole item, leaving the reader on the
                        // item's EndObject token, so only one item is in memory at a time.
                        Car item = serializer.Deserialize<Car>(reader);
                        objnode.cars = new List<Car> { item };
                        insertIntoFileSystem(objnode, counter);
                    }

                    if (reader.TokenType == JsonToken.EndArray)
                    {
                        break;
                    }
                }
            }
        }
    }
}
/// <summary>
/// Serializes <paramref name="objNode"/> to JSON and writes it, followed by a line terminator,
/// to a file under <c>C:\Temp</c> whose name is built from the node's name, age and
/// <paramref name="counter"/>.
/// </summary>
/// <param name="objNode">Node to serialize; its <c>name</c> and <c>age</c> members go into the file name.</param>
/// <param name="counter">Sequence number appended to the file name.</param>
public static void insertIntoFileSystem(Node objNode, int counter)
{
    string targetPath = string.Concat(
        @"C:\Temp\output_", objNode.name, "_", objNode.age, "_", counter, ".json");
    JsonSerializer jsonSerializer = new JsonSerializer();

    using (TextWriter fileWriter = new StreamWriter(targetPath))
    using (StringWriter jsonBuffer = new StringWriter())
    {
        // The JSON is built in memory first, then written to the file in one call.
        jsonSerializer.Serialize(jsonBuffer, objNode);
        string json = jsonBuffer.ToString();
        fileWriter.WriteLine(json);
    }
}
Run Code Online (Sandbox Code Playgroud)
问题
当文件很大时,数组之后的字段都无法被捕获。有没有办法跳过 JSON 中的大型数组,或者让读取器对大型数组进行并行处理?简而言之,我的代码无法捕获下面这部分内容:
"TestCity":"TestCityValue","TestCity1":"TestCityValue1"}
您将需要分两次处理大型 JSON 文件才能获得您想要的结果。
在第一遍中,将文件分成两部分:创建一个仅包含巨大数组的文件,以及包含所有其他信息的第二个文件,该文件将用作您最终要创建的各个 JSON 文件的模板。
在第二遍中,将模板文件读入内存(我假设 JSON 的这一部分相对较小,因此这应该不是问题),然后使用读取器一次处理数组文件中的一项。对于每个项目,将其与模板合并并将其写入单独的文件。
最后,您可以删除临时数组和模板文件。
代码如下:
using System;
using System.IO;
using System.Net.Http;
using System.Text;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
/// <summary>
/// Downloads a JSON document and writes one output file per item of the root-level array
/// named <paramref name="arrayPropertyName"/>. Each output file holds every other root
/// property of the document plus a single array item stored under that property name.
/// </summary>
/// <param name="objUri">Address of the JSON document to download and split.</param>
/// <param name="arrayPropertyName">Name of the root-level property that holds the huge array.</param>
/// <remarks>
/// Works in two passes. Pass one splits the stream into two temporary files: the array, and a
/// "template" with everything else. Pass two loads the template into memory and merges it with
/// the array items one at a time. The temporary files are deleted even when a pass fails.
/// </remarks>
public static void SplitJson(Uri objUri, string arrayPropertyName)
{
    string templateFileName = @"C:\Temp\template.json";
    string arrayFileName = @"C:\Temp\array.json";

    try
    {
        // Pass 1: split the original JSON stream into two temporary files:
        // one that has the huge array and one that has everything else.
        using (HttpClient client = new HttpClient())
        using (Stream stream = client.GetStreamAsync(objUri).Result)
        using (JsonReader reader = new JsonTextReader(new StreamReader(stream)))
        using (JsonWriter templateWriter = new JsonTextWriter(new StreamWriter(templateFileName)))
        using (JsonWriter arrayWriter = new JsonTextWriter(new StreamWriter(arrayFileName)))
        {
            if (reader.Read() && reader.TokenType == JsonToken.StartObject)
            {
                templateWriter.WriteStartObject();
                while (reader.Read() && reader.TokenType != JsonToken.EndObject)
                {
                    // The reader is on a property name; advance to its value.
                    string propertyName = (string)reader.Value;
                    reader.Read();
                    templateWriter.WritePropertyName(propertyName);

                    if (propertyName == arrayPropertyName)
                    {
                        // Stream the whole array to its own file without buffering it.
                        arrayWriter.WriteToken(reader);

                        // Empty placeholder object; replaced per item in pass 2.
                        templateWriter.WriteStartObject();
                        templateWriter.WriteEndObject();
                    }
                    else
                    {
                        // WriteToken copies the current token and any children, so it
                        // covers primitives, objects and arrays alike.
                        templateWriter.WriteToken(reader);
                    }
                }
                templateWriter.WriteEndObject();
            }
        }

        // Pass 2: read the huge array file and combine each item in the array
        // with the template to make new files.
        JObject template = JObject.Parse(File.ReadAllText(templateFileName));
        using (JsonReader arrayReader = new JsonTextReader(new StreamReader(arrayFileName)))
        {
            int counter = 0;
            while (arrayReader.Read())
            {
                if (arrayReader.TokenType == JsonToken.StartObject)
                {
                    counter++;

                    // JObject.Load consumes exactly one item, so nested objects are
                    // never seen by the outer loop.
                    JObject item = JObject.Load(arrayReader);
                    template[arrayPropertyName] = item;
                    string fileName = string.Format(@"C:\Temp\output_{0}_{1}_{2}.json",
                        template["name"], template["age"], counter);
                    File.WriteAllText(fileName, template.ToString());
                }
            }
        }
    }
    finally
    {
        // Clean up temporary files, including after a failed download or parse.
        File.Delete(templateFileName);
        File.Delete(arrayFileName);
    }
}
Run Code Online (Sandbox Code Playgroud)
请注意,由于存在临时文件,上述方法在处理过程中将需要原始 JSON 两倍的磁盘空间。如果这是一个问题,您可以修改代码以下载文件两次(尽管这可能会增加处理时间)。第一次下载时,创建模板 JSON 并忽略数组;在第二次下载中,前进到数组并像以前一样使用模板对其进行处理以创建输出文件。
| 归档时间: |
|
| 查看次数: |
365 次 |
| 最近记录: |