admin管理员组

文章数量:1332360

I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.

However, although I'm happy with the file-size splitting, I noticed that within these smaller split files the contents can be cut anywhere within a line. I would like to keep each line intact and split at the end of a line while staying close to the target byte size.

See the following code I have:

import os
import sys

def getfilesize(sfilename):
    """Return the size of *sfilename* in bytes.

    Args:
        sfilename (str): path of the file to measure.

    Returns:
        int: file size in bytes.
    """
    with open(sfilename, "rb") as fr:
        fr.seek(0, 2)  # seek to end-of-file; tell() then yields the byte size
        size = fr.tell()
        print("getfilesize: size: %s" % size)
        return size  # reuse the value instead of a redundant second tell()

def splitfile(spath, sfilename, splitsize, dpath):
    """Split the file spath+sfilename into chunks of at most *splitsize* bytes.

    Chunks are written as <dpath+name>_1.<ext>, <dpath+name>_2.<ext>, ...
    Any existing chunk file with the same name is overwritten.

    Args:
        spath (str): source directory/prefix (concatenated with sfilename).
        sfilename (str): source file name.
        splitsize (int): maximum chunk size in bytes.
        dpath (str): destination directory/prefix for the chunk files.
    """
    fullsfilepath = spath + sfilename
    if not os.path.isfile(fullsfilepath):
        print("No such file as: \"%s\"" % fullsfilepath)
        return

    filesize = os.path.getsize(fullsfilepath)

    # Split on the LAST dot (os.path.splitext) so "my.data.txt" keeps ".txt";
    # the original str.split(".") broke the name at the first dot.
    base, ext = os.path.splitext(dpath + sfilename)

    readlimit = 5000  # stream in 5 kB buffers so large files never load fully
    # Ceiling division: e.g. 250 bytes at splitsize=100 needs 3 chunks, not 2.
    n_splits = -(-filesize // splitsize)
    print("splitfile: No of splits required: %s" % str(n_splits))

    with open(fullsfilepath, "rb") as fr:
        counter = 1
        while True:
            remaining = splitsize
            data = fr.read(min(readlimit, remaining))
            if not data:
                break  # source exhausted; never create an empty trailing chunk
            # "wb" truncates any pre-existing chunk, replacing the original
            # open-"ab"/seek(0)/truncate() sequence.
            with open("%s_%d%s" % (base, counter, ext), "wb") as fw:
                while data:
                    fw.write(data)
                    remaining -= len(data)
                    if remaining <= 0:
                        break  # chunk reached splitsize exactly; the original
                        # wrote one extra buffer (splitsize + readlimit bytes)
                    data = fr.read(min(readlimit, remaining))
            counter += 1

if __name__ == "__main__":
    # argv layout: [1]=source dir, [2]=source file name, [3]=chunk size in kB, [4]=dest dir
    if len(sys.argv) < 5:
        # Was `< 4`, which let a 4-argument call pass the check and then
        # crash on sys.argv[4] below; the usage text also omitted spath/dpath.
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes (decimal kilobytes)
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)

Is it possible to achieve this? If so how can this be done?

I have python code which splits a file into smaller chunks with byte size, for example filename.txt being 1GB split into 10 smaller files of 100MB - filename_001.txt, filename_002.txt etc.

However, although I'm happy with the file-size splitting, I noticed that within these smaller split files the contents can be cut anywhere within a line. I would like to keep each line intact and split at the end of a line while staying close to the target byte size.

See the following code I have:

import os
import sys

def getfilesize(sfilename):
    """Return the size of *sfilename* in bytes.

    Args:
        sfilename (str): path of the file to measure.

    Returns:
        int: file size in bytes.
    """
    with open(sfilename, "rb") as fr:
        fr.seek(0, 2)  # seek to end-of-file; tell() then yields the byte size
        size = fr.tell()
        print("getfilesize: size: %s" % size)
        return size  # reuse the value instead of a redundant second tell()

def splitfile(spath, sfilename, splitsize, dpath):
    """Split the file spath+sfilename into chunks of at most *splitsize* bytes.

    Chunks are written as <dpath+name>_1.<ext>, <dpath+name>_2.<ext>, ...
    Any existing chunk file with the same name is overwritten.

    Args:
        spath (str): source directory/prefix (concatenated with sfilename).
        sfilename (str): source file name.
        splitsize (int): maximum chunk size in bytes.
        dpath (str): destination directory/prefix for the chunk files.
    """
    fullsfilepath = spath + sfilename
    if not os.path.isfile(fullsfilepath):
        print("No such file as: \"%s\"" % fullsfilepath)
        return

    filesize = os.path.getsize(fullsfilepath)

    # Split on the LAST dot (os.path.splitext) so "my.data.txt" keeps ".txt";
    # the original str.split(".") broke the name at the first dot.
    base, ext = os.path.splitext(dpath + sfilename)

    readlimit = 5000  # stream in 5 kB buffers so large files never load fully
    # Ceiling division: e.g. 250 bytes at splitsize=100 needs 3 chunks, not 2.
    n_splits = -(-filesize // splitsize)
    print("splitfile: No of splits required: %s" % str(n_splits))

    with open(fullsfilepath, "rb") as fr:
        counter = 1
        while True:
            remaining = splitsize
            data = fr.read(min(readlimit, remaining))
            if not data:
                break  # source exhausted; never create an empty trailing chunk
            # "wb" truncates any pre-existing chunk, replacing the original
            # open-"ab"/seek(0)/truncate() sequence.
            with open("%s_%d%s" % (base, counter, ext), "wb") as fw:
                while data:
                    fw.write(data)
                    remaining -= len(data)
                    if remaining <= 0:
                        break  # chunk reached splitsize exactly; the original
                        # wrote one extra buffer (splitsize + readlimit bytes)
                    data = fr.read(min(readlimit, remaining))
            counter += 1

if __name__ == "__main__":
    # argv layout: [1]=source dir, [2]=source file name, [3]=chunk size in kB, [4]=dest dir
    if len(sys.argv) < 5:
        # Was `< 4`, which let a 4-argument call pass the check and then
        # crash on sys.argv[4] below; the usage text also omitted spath/dpath.
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        splitsize = int(sys.argv[3]) * 1000  # kB -> bytes (decimal kilobytes)
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, splitsize, dpath)

Is it possible to achieve this? If so how can this be done?

Share Improve this question edited Nov 21, 2024 at 20:01 rob asked Nov 21, 2024 at 2:15 robrob 1731 silver badge11 bronze badges 5
  • How about splitting on a number of lines, given the overall count? Also, readlimit could be a multiple of 4096 and big enough to reduce the number of reads: 1024*1024*1024*1024/4096 = 268435456, i.e. ~260M reads with readlimit 4096 – LMC Commented Nov 21, 2024 at 2:47
  • Please use the internationally agreed SI units. For gigabytes, use GB because gb is not defined and Gb is gigabits. Likewise please use MB for megabytes, since your mb would read as millibits. – Mark Setchell Commented Nov 21, 2024 at 7:31
  • What do you expect to happen when you retain a line at the end of a chunk? Do you leave out the incomplete line so that the size of the chunk does not exceed the maximum chunk size? Or do you want the rest of the line to be included even though it would make the chunk size larger than the maximum chunk size? – blhsing Commented Nov 21, 2024 at 9:25
  • GNU Parallel will do that for you without writing any Python, e.g. parallel --pipe --block 100m wc :::: YOURFILE – Mark Setchell Commented Nov 21, 2024 at 10:16
  • @blhsing either way is fine - I understand that the split files would not all be exact same size but close to the specified chunk size – rob Commented Nov 21, 2024 at 20:04
Add a comment  | 

1 Answer 1

Reset to default 0

Maybe you should determine the maximum line size first; then you would know how large the chunk size needs to be. Here is code that may help:

import os
import sys
from pathlib import Path

def lines_max_size(filepath: str) -> int:
    """Return the byte length (UTF-8) of the longest line in *filepath*.

    Args:
        filepath (str): path of the file to scan.

    Returns:
        int: byte size of the largest line, 0 for an empty file.
    """
    with open(filepath, "r") as f:
        # len(line.encode()) is the real content byte count; the original's
        # sys.getsizeof(line) measured the Python str *object*, which includes
        # interpreter overhead (~49 bytes) and varies between builds.
        return max((len(line.encode("utf-8")) for line in f), default=0)


def split_file(filepath: str, chunk_size: int, outfile_prefix):
    """Split a text file into smaller files along line boundaries.

    Each output file is at most *chunk_size* bytes (UTF-8) and contains only
    whole lines. Output files are written next to the source file as
    <outfile_prefix><n>.<ext>.

    Args:
        filepath (str): path of the file to split.
        chunk_size (int): maximum size in bytes of each output file.
        outfile_prefix (str): name prefix for the output files.

    Raises:
        Exception: if a single line is larger than chunk_size.
    """
    buffered = []      # lines of the current chunk; joined once per flush to
                       # avoid the original's quadratic `content += line`
    buffered_size = 0  # running UTF-8 byte total of `buffered`
    count = 1
    save_path = Path(filepath).parent
    ext = filepath.split(".")[-1]

    def _flush(lines):
        # Write one chunk file. Note the "." before the extension: the
        # original built names like "output1txt" with no dot.
        nonlocal count
        fp = save_path.joinpath("%s%d.%s" % (outfile_prefix, count, ext))
        with open(fp, "w") as fw:
            fw.write("".join(lines))
        count += 1

    with open(filepath, "r") as f:
        for line in f:
            # Real byte size; sys.getsizeof(line) measured the str object
            # (content + interpreter overhead), skewing every threshold.
            line_size = len(line.encode("utf-8"))
            if line_size > chunk_size:
                raise Exception("current line size is %s, chunk_size - %s too small to split file by lines" % (
                    line_size,
                    chunk_size
                ))

            if buffered_size + line_size <= chunk_size:
                buffered.append(line)
                buffered_size += line_size
            else:
                _flush(buffered)
                buffered = [line]
                buffered_size = line_size
    if buffered:
        _flush(buffered)


def list_files_content_size(path: Path, file_prefix: str):
    """Print the byte size of every file under *path* named with *file_prefix*.

    Non-recursive: only direct children of *path* are considered.

    Args:
        path (Path): directory whose files are listed.
        file_prefix (str): only file names starting with this prefix are listed.
    """
    for fn in os.listdir(path):
        if not fn.startswith(file_prefix):
            continue
        fp = path.joinpath(fn)
        # os.path.getsize reports the real on-disk byte count; the original's
        # sys.getsizeof(f.read()) measured the str object including
        # interpreter overhead, so the printed sizes were inflated.
        size = os.path.getsize(fp)
        print(fn, str(size) + "bytes")


if __name__ == "__main__":
    filepath = "a.txt"
    # First, find the largest line: chunk_size must be at least this big,
    # otherwise split_file() raises.
    print("max size of lines is: ", lines_max_size(filepath))

    # Then split the large file. Pass the `filepath` variable instead of
    # repeating the "a.txt" literal, so changing the input needs one edit.
    split_file(filepath=filepath, chunk_size=240, outfile_prefix="output")

    # Finally, list the sizes of the generated output files.
    current_path = Path(__file__).parent
    list_files_content_size(current_path, "output")

本文标签: pythonSplit large text file into smaller chunks by byte size and to retain end of lineStack Overflow