# sr-download/sr_download/sql/xml_parse.py
#
# 82 lines
# 2.1 KiB
# Python
# Raw Normal View History
#
# 2024-08-29 02:18:50 +08:00
from __future__ import annotations
import psycopg2
import tomli
from lib_not_dr import loggers
# Load the TOML configuration once, at import time. The relative path is
# resolved against the current working directory, and tomli requires the
# file to be opened in binary mode.
with open("../../config.toml", "rb") as config_file:
    CONFIG = tomli.load(config_file)

# Module-wide logger obtained from lib_not_dr under the name "xml_parse".
logger = loggers.config.get_logger("xml_parse")
def get_db():
    """Open a new database connection using the URL at CONFIG["db"]["url"].

    Returns:
        The connection object produced by ``psycopg2.connect``. The caller
        owns it and is responsible for closing it.
    """
    db_url = CONFIG["db"]["url"]
    return psycopg2.connect(db_url)
# 2024-08-30 02:15:22 +08:00
def fetch_data(db_cur, offset, limit):
    """Fetch one page of per-save part-type counts.

    Selects up to ``limit`` rows from ``public.full_data`` (skipping the
    first ``offset``) whose ``save_type`` is not ``'none'`` and whose
    ``data`` column is a well-formed XML document. Every
    ``//Ship/Parts/Part`` element of each document is expanded with
    XMLTABLE, and the ``partType`` attributes are counted per save.

    Args:
        db_cur: A DB-API cursor; only ``execute(query, params)`` and
            ``fetchall()`` are used.
        offset: Number of matching saves to skip.
        limit: Maximum number of saves to inspect in this page.

    Returns:
        Whatever ``db_cur.fetchall()`` yields for the query: one
        ``(id, part_types)`` row per save, where ``part_types`` is a
        ``'|'``-separated list of ``partType:count`` entries.

    Note:
        XMLTABLE is joined with a plain comma (an implicit lateral cross
        join), so a save without any matching ``Part`` element produces no
        row. A page may therefore contain fewer rows than ``limit``, or
        none at all, even when later pages are not empty.
    """
    # LIMIT/OFFSET are bound as query parameters instead of being
    # interpolated into the SQL text. The ORDER BY makes paging
    # deterministic: without it PostgreSQL may return rows in a different
    # order for each query, so consecutive pages could overlap or skip saves.
    xml_fetch = """
    WITH data AS (
        SELECT save_id as id, data
        FROM public.full_data
        WHERE "save_type" != 'none'
        AND xml_is_well_formed_document(full_data."data")
        ORDER BY save_id
        LIMIT %s OFFSET %s
    ),
    parts_data AS (
        SELECT data.id, parts.part_type
        FROM data,
        XMLTABLE (
            '//Ship/Parts/Part'
            PASSING BY VALUE xmlparse(document data."data")
            COLUMNS part_type text PATH '@partType',
                    part_id text PATH '@id'
        ) AS parts
    )
    SELECT id, string_agg(part_type || ':' || part_count, '|') AS part_types
    FROM (
        SELECT id, part_type, COUNT(part_type) AS part_count
        FROM parts_data
        GROUP BY id, part_type
    ) AS counted_parts
    GROUP BY id;
    """
    db_cur.execute(xml_fetch, (limit, offset))
    return db_cur.fetchall()
# 2024-08-29 02:18:50 +08:00
def main():
    """Page through every parsed save and log each result row.

    Opens a connection with ``get_db()``, then repeatedly calls
    ``fetch_data`` with a growing offset and logs every returned row.
    Stops at the first page that comes back empty.
    """
    db = get_db()
    try:
        # The cursor context manager closes the cursor on exit, even when a
        # query raises.
        with db.cursor() as db_cur:
            offset = 0
            limit = 100
            while True:
                datas = fetch_data(db_cur, offset, limit)
                if not datas:
                    break
                for data in datas:
                    logger.info(data)
                offset += limit
    finally:
        # Always release the connection; the original never closed it.
        db.close()
# 2024-08-29 02:18:50 +08:00
# Run the scan only when executed as a script, so that importing this module
# (e.g. to reuse fetch_data) does not start paging through the database.
if __name__ == "__main__":
    main()