I had this "JSON" file sent to me:
"{ "key": "value", "anotherkey": \"value"value\", "number": 45., "number2": nan }"
Let's remove the enclosing quotes and format it for readability:
{
"key": "value",
"anotherkey": \"value"value\",
"number": 45.,
"number2": nan
}
I have no idea how something like this was generated.
If the file is small enough or the data regular enough, you could fix it by hand with some search & replace.
But the file I had was gigabytes in size and most of it looked fine.
Except for lines like these:
{ "key": ":\\\\\\\\\\", "anotherkey": "value" }
{ "key": "Something \\\\"Name\\\\" something\", "anotherkey": "value" }
Instead of crafting unreadable regexes, I decided to write my own quick & dirty malformed-JSON parser.
Here is the first version:
# First version: walk the raw text one character at a time and print every
# key, string, number and NaN that is recognised.
with open('input.json') as f:  # release the file handle once the text is in memory
    s = f.read()

in_key = False  # True while the next string is an object key rather than a value
i = 1  # skip initial "
while i < len(s) - 2:  # skip trailing " and \n
    c = s[i]
    i += 1
    if c == '{':
        in_key = True
    elif c == '}':
        pass
    elif c == '[':
        pass
    elif c == ']':
        pass
    elif c == ':':
        in_key = False
    elif c == ',':
        in_key = True
    elif is_num(c):
        v = c
        while i < len(s) and is_num(s[i]):
            v += s[i]
            i += 1
        # i now rests on the first character after the number. Leave it for
        # the next iteration so a following ',', '}' or ']' is processed
        # normally instead of being silently skipped.
        print(v)
    elif s.startswith('nan', i - 1):
        # c is s[i - 1]; startswith() cannot index past the end of the text.
        i += 2
        print('NaN')
    elif c.isspace():
        pass
    elif c == '"':
        v = read_until(s, i, '"')
        i += len(v) + 1  # step over the string and its closing quote
        print(v)
    else:
        print('UNKNOWN: ' + c)
        # break  # enable to stop at the first unrecognised character
It relies on these two helper functions:
# numbers can be -4.
def is_num(c):
    """Tell whether ``c`` may be part of a numeric literal.

    A character qualifies when it is a decimal point, a minus sign or a
    digit, which is enough to collect sloppy values such as ``-4.``.
    """
    if c in ('.', '-'):
        return True
    return c.isdigit()
# read "strings" and "st\"rin\"gs"
def read_until(s, i, token):
    """Return the text of ``s`` from index ``i`` up to the next unescaped ``token``.

    An occurrence of ``token`` is ignored when the character immediately
    before it is a backslash, so ``st\\"rin\\"gs"`` is read up to its final
    quote. The terminator itself is not part of the result.

    If no unescaped ``token`` is found, everything from ``i`` to the end of
    ``s`` is returned; no trailing characters are dropped.
    """
    start = i
    end = len(s)
    prev = None  # character preceding position i, used to detect escapes
    while i < end:
        if prev != '\\' and s.startswith(token, i):
            break
        prev = s[i]
        i += 1
    # One slice instead of growing a string character by character.
    return s[start:i]
Not elegant but that's not my goal here.
Running it on the sample input gives this output:
key
value
anotherkey
UNKNOWN: \
Now I need to parse strings like these:
{ "anotherkey": \"value"value\" }
I am going to add:
# Excerpt of the changes to the main loop; "# ..." stands for unchanged code.
# reverse is a new state flag next to in_key: it is set when a value is opened
# with \" instead of a plain ", and cleared once that value has been read.
in_key = False
reverse = False
# ...
# A backslash directly followed by a quote. In value position (not in a key)
# this marks a string delimited as \"...\"; the quote itself is handled by the
# next branch on the following iteration.
elif c == '\\' and s[i] == '"':
if not in_key:
reverse = True
elif c == '"':
v = ''
if not reverse:
# Ordinary string: read up to the next unescaped " and step past it.
v = read_until(s, i, '"')
i += len(v) + 1
else:
# Reversed string: read up to the closing \" and step past both of its
# characters, then return to normal string handling.
v = read_until(s, i, '\\"')
i += len(v) + 2
reverse = False
print(v)
and get this:
key
value
anotherkey
value"value
number
45.
number2
NaN
Unfortunately, there were even more malformed strings, like { "key": "wtf\", "another": "value" }, so I had to add some dirty code like this:
# Excerpt of the string branch; "# ..." stands for the read_until() calls
# shown earlier. These checks repair values whose closing quote was escaped,
# which makes read_until() run on into the following tokens.
elif c == '"':
v = ''
# ...
# The value swallowed the closing quote, the comma and the space: cut those
# three characters off and rewind so the comma is parsed again.
if v.endswith('", '):
v = v[:-3]
i -= 3 # go back to comma
# The value ran past the end of its object (\"}, {): cut the six characters
# off and rewind so the closing brace is parsed again.
if v.endswith('\\"}, {'):
v = v[:-6]
i -= 5 # go back to }
print(v)
Another option would be to ignore everything until the next }
(I'd lose this object literal) but in my case I had to preserve all data.
Finally, let's save the result as a valid JSON file:
# Load the whole malformed input into memory; the context manager closes the
# input file as soon as it has been read.
with open('input.json') as f:
    s = f.read()
# Destination for the repaired JSON; closed once the conversion loop is done.
out = open('output.json', 'w')
in_key = False   # True while the next string is an object key rather than a value
reverse = False  # True while reading a value that was opened with \" instead of "
def is_num(c):
    """Tell whether ``c`` may be part of a numeric literal.

    A character qualifies when it is a decimal point, a minus sign or a
    digit, which is enough to collect sloppy values such as ``-4.``.
    """
    if c in ('.', '-'):
        return True
    return c.isdigit()
def read_until(s, i, token):
    """Return the text of ``s`` from index ``i`` up to the next unescaped ``token``.

    An occurrence of ``token`` is ignored when the character immediately
    before it is a backslash, so ``st\\"rin\\"gs"`` is read up to its final
    quote. The terminator itself is not part of the result.

    If no unescaped ``token`` is found, everything from ``i`` to the end of
    ``s`` is returned; no trailing characters are dropped.
    """
    start = i
    end = len(s)
    prev = None  # character preceding position i, used to detect escapes
    while i < end:
        if prev != '\\' and s.startswith(token, i):
            break
        prev = s[i]
        i += 1
    # One slice instead of growing a string character by character.
    return s[start:i]
# Conversion loop: walk the raw text one character at a time and write the
# repaired JSON to `out`, one object per line.
i = 1  # skip initial "
line = 1  # incremented each time a closing brace ends an output line
try:
    while i < len(s) - 2:  # skip trailing " and \n
        c = s[i]
        i += 1
        if c == '{':
            in_key = True
            out.write('{ ')
        elif c == '}':
            out.write(' }\n')
            line += 1
        elif c == '[':
            out.write('[ ')
        elif c == ']':
            out.write(' ]')
        elif c == ':':
            in_key = False
            out.write(': ')
        elif c == ',':
            in_key = True
            out.write(', ')
        elif is_num(c):
            v = c
            while i < len(s) and is_num(s[i]):
                v += s[i]
                i += 1
            # JSON needs digits on both sides of the decimal point. Pad only
            # where one is missing, so an integer such as 45 is written as
            # 45 and not 450.
            if v.endswith('.'):
                v += '0'
            if v.startswith('.'):
                v = '0' + v
            elif v.startswith('-.'):
                v = '-0' + v[1:]
            out.write(v)
            # i rests on the delimiter that follows the number. The next
            # iteration handles it, so ',', '}' and ']' are emitted exactly
            # as they occur and in_key is updated by the comma branch.
        elif s.startswith('nan', i - 1):
            # c is s[i - 1]; NaN has no JSON representation, so emit null.
            i += 2
            out.write('null')
        elif c.isspace():
            pass
        elif c == '\\' and s[i] == '"':
            # \" in value position opens a string delimited as \"...\".
            if not in_key:
                reverse = True
        elif c == '"':
            if not reverse:
                v = read_until(s, i, '"')
                i += len(v) + 1  # step over the string and its closing "
            else:
                v = read_until(s, i, '\\"')
                i += len(v) + 2  # step over the string and its closing \"
                reverse = False
            # An escaped closing quote makes read_until() run on into the
            # next tokens; trim the excess and rewind so it is parsed again.
            if v.endswith('", '):
                v = v[:-3]
                i -= 3  # go back to comma
            if v.endswith('\\"}, {'):
                v = v[:-6]
                i -= 5  # go back to }
            if not in_key:
                # Drop every backslash, then escape the remaining quotes.
                v = v.replace('\\', '').replace('"', '\\"')
            out.write('"')
            out.write(v)
            out.write('"')
        else:
            print('UNKNOWN: ' + c)
            # break  # enable to stop at the first unrecognised character
finally:
    # Flush and close the output even if the loop fails part-way through.
    out.close()
Which looks like this:
{ "key": "value", "anotherkey": "value\"value", "number": 45.0, "number2": null }
Took me about 2 hours.