怎样批量提取 PPT 中多个文本框里的文字？

梦貘 2024-06-03 16:41

他山之石

前言：最近整理笔记，需要从PPT中批量提取所有问题，于是从知乎上搜到了这篇文章，十分有用，遂存档备用。

原文地址

首先考虑最简单的情况，如果你的 PPT 是严格按照模板的大纲来写的话，可以直接用 PowerPoint 自带的「另存为/导出」功能，将文本导出为「rtf 文件」：

导出的 rtf 文件如下（与 text 文件的区别在于，它还保留了字体、字号、颜色等信息）：

然后考虑复杂点的情况，如果你的 PPT 文件不是按照标准的模板来做的话，比如插入了很多自定义的文本框，这时就需要用到下面的方法了：

一、Windows 上从单页或多页 PPT 中提取所有文本框内的文字

主要利用 VBA 代码[1][2]实现

首先需要打开 PowerPoint 中的 Visual Basic 编辑器，并插入一个「模块」：

在新插入的模块的编辑器中输入以下代码，然后点击顶部菜单栏中的三角按钮来运行：

运行之后，就会在 PPT 同目录下生成与 PTT 文件同名的 .txt 文件，其中内容格式如下：

Slide: [页码编号]

Slide: [页码编号]

……

上面用到的 VBA 代码如下（也适用于 Mac）：

如果不想要导出全部幻灯片的文本，可以在代码里面修改页码范围，即把下面的代码 For i = 1 To num_slides 修改成你想要的范围（如第 4 页到第 8 页）： For i = 4 To 8

Sub ExportText()
  Dim oPres As Presentation
  Dim oSlides As Slides
  Dim oSld As Slide         'Slide Object
  Dim oShp As Shape         'Shape Object
  Dim iFile As Integer      'File handle for output
  iFile = FreeFile          'Get a free file number
  Dim PathSep As String
  Dim FileNum As Integer
  Dim sTempString As String

  #If Mac Then
    PathSep = "/"
  #Else
    PathSep = "\"
  #End If

  Set oPres = ActivePresentation
  Set oSlides = oPres.Slides

  FileNum = FreeFile

  'Open output file
  ' NOTE:  errors here if file hasn't been saved
  'Open oPres.Path & PathSep & "AllText.txt" For Output As FileNum
  Open oPres.Path & PathSep & oPres.Name & ".txt" For Output As FileNum
  
  num_slides = ActivePresentation.Slides.Count

  For i = 1 To num_slides
    Set oSld = ActivePresentation.Slides(i)
    Print #iFile, "Slide:" & vbTab & CStr(oSld.SlideNumber)

    For Each oShp In oSld.Shapes
      'Check to see if shape has a text frame and text
      If oShp.HasTextFrame And oShp.TextFrame.HasText Then
        If oShp.Type = msoPlaceholder Then
            Select Case oShp.PlaceholderFormat.Type
                Case Is = ppPlaceholderTitle, ppPlaceholderCenterTitle
                    Print #iFile, "标题：" & vbTab & oShp.TextFrame.TextRange
                Case Is = ppPlaceholderBody
                    Print #iFile, "正文：" & vbTab & oShp.TextFrame.TextRange
                Case Is = ppPlaceholderSubtitle
                    Print #iFile, "副标题：" & vbTab & oShp.TextFrame.TextRange
                Case Else
                    Print #iFile, "其他占位符：" & vbTab & oShp.TextFrame.TextRange
            End Select
        Else
            Print #iFile, vbTab & oShp.TextFrame.TextRange
        End If  ' msoPlaceholder
      Else  ' it doesn't have a textframe - it might be a group that contains text so:
        If oShp.Type = msoGroup Then
            sTempString = TextFromGroupShape(oShp)
            If Len(sTempString) > 0 Then
                Print #iFile, sTempString
            End If
        ElseIf oShp.Type = msoSmartArt Then
            sTempString = TextFromSmartArtNode(oShp.SmartArt.Nodes, 0)
            If Len(sTempString) > 0 Then
                Print #iFile, sTempString
            End If
        End If
      End If    ' Has text frame/Has text
    Next oShp
    
    Print #iFile, vbCrLf
    Next i
  Close #iFile

  'MsgBox "文件已创建在 '" & oPres.Path & PathSep & "AllText.txt'"
  MsgBox "文件已创建在 '" & oPres.Path & PathSep & oPres.Name & ".txt'"
End Sub

Function TextFromGroupShape(oSh As Shape) As String
' Returns the text from the shapes in a group
' and recursively, text within shapes within groups within groups etc.

    Dim oGpSh As Shape
    Dim sTempText As String

    If oSh.Type = msoGroup Then
        For Each oGpSh In oSh.GroupItems
            With oGpSh
                If .Type = msoGroup Then
                    sTempText = sTempText & TextFromGroupShape(oGpSh)
                Else
                    If .HasTextFrame Then
                        If .TextFrame.HasText Then
                            sTempText = sTempText & "(Gp:) " & .TextFrame.TextRange.Text & vbCrLf
                        End If
                    End If
                End If
            End With
        Next
    End If

    TextFromGroupShape = sTempText

End Function

Function TextFromSmartArtNode(oSh As SmartArtNodes, depth As Long) As String
' Returns the text from the shapes in a SmartArt shape recursively

    Dim sTempText As String
    For i = 1 To oSh.Count
        With oSh(i)
            If .TextFrame2.TextRange.Text <> "" Then
                If depth = 0 Then
                    sTempText = sTempText & "(SmartArt:)" & .TextFrame2.TextRange & vbCrLf
                Else
                    sTempText = sTempText & Space(depth * 4) & .TextFrame2.TextRange & vbCrLf
                End If
                sTempText = sTempText & TextFromSmartArtNode(.Nodes, depth + 1)
            End If
        End With
    Next i

    TextFromSmartArtNode = sTempText

End Function

【更新】

增加了批量处理多个文件的功能，现在会自动在所处理的 PPT 文件同目录下生成同名的 txt 文件，如处理 “C:\Users\Emrys\Desktop\demo.pptx” 后会生成 “C:\Users\Emrys\Desktop\demo.pptx.txt”。

注意：如果选择的 PPT 文件比较多/比较大，运行可能比较卡顿且耗时稍久，此时请耐心等待，不要频繁操作电脑。

Sub ExportText()
  Dim oPres As Presentation
  Dim oSlides As Slides
  Dim oSld As Slide         'Slide Object
  Dim oShp As Shape         'Shape Object
  Dim iFile As Integer      'File handle for output
  iFile = FreeFile          'Get a free file number
  Dim PathSep As String
  Dim FileNum As Integer
  Dim sTempString As String
  Dim fd() As String

  #If Mac Then
    PathSep = "/"
  #Else
    PathSep = "\"
  #End If

  fd = Split(FileDialogOpen, vbLf)
  If Left(fd(0), 1) = "-" Then
    Debug.Print "Canceled"
    Exit Sub
  End If

  For n = LBound(fd) To UBound(fd)
    Set oPres = Presentations.Open(FileName:=fd(n), ReadOnly:=msoTrue, WithWindow:=msoTrue)
    Set oSlides = oPres.Slides

    FileNum = FreeFile

    'Open output file
    ' NOTE:  errors here if file hasn't been saved
    Open oPres.Path & PathSep & oPres.Name & ".txt" For Output As FileNum
    
    num_slides = oPres.Slides.Count

    For i = 1 To num_slides
      Set oSld = oPres.Slides(i)
      Print #iFile, "Slide:" & vbTab & CStr(oSld.SlideNumber)
      For Each oShp In oSld.Shapes
        'Check to see if shape has a text frame and text
        If oShp.HasTextFrame And oShp.TextFrame.HasText Then
          If oShp.Type = msoPlaceholder Then
            Select Case oShp.PlaceholderFormat.Type
              Case Is = ppPlaceholderTitle, ppPlaceholderCenterTitle
                Print #iFile, "标题：" & vbTab & oShp.TextFrame.TextRange
              Case Is = ppPlaceholderBody
                Print #iFile, "正文：" & vbTab & oShp.TextFrame.TextRange
              Case Is = ppPlaceholderSubtitle
                Print #iFile, "副标题：" & vbTab & oShp.TextFrame.TextRange
              Case Else
                Print #iFile, "其他占位符：" & vbTab & oShp.TextFrame.TextRange
            End Select
          Else
            Print #iFile, vbTab & oShp.TextFrame.TextRange
          End If  ' msoPlaceholder
        Else  ' it doesn't have a textframe - it might be a group that contains text so:
          If oShp.Type = msoGroup Then
            sTempString = TextFromGroupShape(oShp)
            If Len(sTempString) > 0 Then
              Print #iFile, sTempString
            End If
          ElseIf oShp.Type = msoSmartArt Then
            sTempString = TextFromSmartArtNode(oShp.SmartArt.Nodes, 0)
            If Len(sTempString) > 0 Then
              Print #iFile, sTempString
            End If
          End If
        End If    ' Has text frame/Has text
      Next oShp
      
      Print #iFile, vbCrLf
    Next i
    Close #iFile
    oPres.Close
  Next n

  MsgBox "已处理 " & UBound(fd) - LBound(fd) + 1 & " 个文件"
End Sub

Function TextFromGroupShape(oSh As Shape) As String
' Returns the text from the shapes in a group
' and recursively, text within shapes within groups within groups etc.

  Dim oGpSh As Shape
  Dim sTempText As String

  If oSh.Type = msoGroup Then
    For Each oGpSh In oSh.GroupItems
      With oGpSh
        If .Type = msoGroup Then
          sTempText = sTempText & TextFromGroupShape(oGpSh)
        Else
          If .HasTextFrame Then
            If .TextFrame.HasText Then
              sTempText = sTempText & "(Gp:) " & .TextFrame.TextRange.Text & vbCrLf
            End If
          End If
        End If
      End With
    Next
  End If

  TextFromGroupShape = sTempText

NormalExit:
  Exit Function

Errorhandler:
  Resume Next

End Function


Function TextFromSmartArtNode(oSh As SmartArtNodes, depth As Long) As String
' Returns the text from the shapes in a SmartArt shape recursively

    Dim sTempText As String
    For i = 1 To oSh.Count
        With oSh(i)
            If .TextFrame2.TextRange.Text <> "" Then
                If depth = 0 Then
                    sTempText = sTempText & "(SmartArt:)" & .TextFrame2.TextRange & vbCrLf
                Else
                    sTempText = sTempText & Space(depth * 4) & .TextFrame2.TextRange & vbCrLf
                End If
                sTempText = sTempText & TextFromSmartArtNode(.Nodes, depth + 1)
            End If
        End With
    Next i

    TextFromSmartArtNode = sTempText

End Function


Function FileDialogOpen() As String

  #If Mac Then
    ' 默认路径
    mypath = MacScript("return (path to desktop folder) as String")

    sMacScript = "set applescript's text item delimiters to "","" " & vbNewLine & _
    "try " & vbNewLine & _
    "set theFiles to (choose file of type {""ppt"", ""pptx""}" & _
    "with prompt ""请选择要处理的一个或多个 PowerPoint 文档"" default location alias """ & _
    mypath & """ multiple selections allowed true)" & vbNewLine & _
    "set applescript's text item delimiters to """" " & vbNewLine & _
    "on error errStr number errorNumber" & vbNewLine & _
    "return errorNumber " & vbNewLine & _
    "end try " & vbNewLine & _
    "repeat with i from 1 to length of theFiles" & vbNewLine & _
    "if i = 1 then" & vbNewLine & _
    "set fpath to POSIX path of item i of theFiles" & vbNewLine & _
    "else" & vbNewLine & _
    "set fpath to fpath & """ & vbNewLine & _
    """ & POSIX path of item i of theFiles" & vbNewLine & _
    "end if" & vbNewLine & _
    "end repeat" & vbNewLine & _
    "return fpath"

    FileDialogOpen = MacScript(sMacScript)

  #Else
    With Application.FileDialog(msoFileDialogOpen)
      .AllowMultiSelect = True
      .Title = "请选择要处理的一个或多个 PowerPoint 文档"
      .Filters.Add "PowerPoint 文档", "*.ppt; *.pptx", 1
      If .Show = -1 Then
        FileDialogOpen = ""
        For i = 1 To .SelectedItems.Count
          If i = 1 Then
            FileDialogOpen = .SelectedItems.Item(i)
          Else
            FileDialogOpen = FileDialogOpen & vbLf & .SelectedItems.Item(i)
          End If
        Next
      Else
        FileDialogOpen = "-"
      End If
    End With

  #End If
End Function

二、Mac OS 上从单页或多页 PPT 中提取所有文本框内的文字

主要利用 AppleScript 脚本实现

首先打开想要提取文字的 PPT 文档，然后打开「脚本编辑器/Script Editor」app，在其中输入下图中的代码后，点击运行按钮：

运行之后，会弹窗询问需要提取文字的 PPT 页码范围，输入之后，就会在桌面上自动生成以该 PPT 文件名为前缀的 txt 文件，并打开它。txt 文件的内容为所选 PPT 页码范围内的所有文本框内容，默认格式为：

(Page ?) xxxxx -————————- yyyyyy -————————-

(Page ?) xxxx -————————-

其中 (Page ?) 表示下面的文本为第 ? 页的内容，-------------------------- 用于分隔不同文本框的文字。如果不想要这些信息，只要把代码中的相应文本（如第 3 行和第 8 行代码）改掉即可。

上面用到的 AppleScript 脚本如下：

applescript
set userLocale to user locale of (system info)
on pptGetAllText(page_from as integer, page_to as integer)
   set delimit to "--------------------------"
   tell application "Microsoft PowerPoint"
      activate
      set total_text to ""
      repeat with slideNumber from page_from to page_to
         set page_text to "(Page " & (slideNumber as text) & ")
"
         tell slide slideNumber of active presentation
            repeat with shapeNumber from 1 to count of shapes
               set shapeText to content of text range of text frame of shape shapeNumber
               if shapeText is not missing value then
                  set page_text to (page_text & shapeText & "
" & delimit & "
")
               end if
            end repeat
         end tell
         set total_text to total_text & page_text & "


"
      end repeat
   end tell
   return total_text
end pptGetAllText

on validateInt(str_number as text)
   try
      set N to str_number as integer
      return {0, N}
   on error number errorNumber
      return {-1, -1}
   end try
end validateInt

on validateInput(page_from_to as text, num_slides as integer)
   set {flag, page_from, page_to} to {-1, -1, -1}
   set oldDelims to my text item delimiters -- save the current delimiters
   set my text item delimiters to {"-", "~", "～"} -- the character to split on
   set num to (count of text items of page_from_to)
   if num = 1 then
      # single page
      set {flag1, page_from} to my validateInt(first text item of page_from_to)
      if flag1 = 0 and page_from > 0 and page_from ≤ num_slides then set {flag, page_to} to {0, page_from}
   else if num = 2 then
      # multiple pages
      set {flag1, page_from} to my validateInt(first text item of page_from_to)
      set {flag2, page_to} to my validateInt(second text item of page_from_to)
      if flag1 = 0 and flag2 = 0 and page_from > 0 and page_from ≤ num_slides and page_from ≤ page_to and page_to > 0 and page_to ≤ num_slides then set flag to 0
   end if
   set my text item delimiters to oldDelims -- just to be safe, restore the old delimiters
   return {flag, page_from, page_to}
end validateInput

tell application "Microsoft PowerPoint" -- version: 2019
   set ppt_name to name of active presentation
   set num_slides to count of slides of active presentation
   set cur_page to slide number of slide of view of active window
   if cur_page is missing value then
      set tips to ""
   else
      set tips to "
[当前页码为 " & cur_page & "]"
   end if
   if num_slides = 1 then
      set default_ans to "1"
   else
      set default_ans to "1-" & (num_slides as text)
   end if
   set page_from_to to the text returned of (display dialog "请输入要处理的 PPT 页码范围" & tips & ":" default answer default_ans with title "提取 PPT 文本框内容" with icon note)
   set {flag, page_from, page_to} to my validateInput(page_from_to, num_slides)
   
   repeat while flag ≠ 0
      set page_from_to to the text returned of (display dialog "请输入要处理的 PPT 页码范围
[请确保页码范围有效]:" default answer default_ans with title "提取 PPT 文本框内容" with icon caution)
      set {flag, page_from, page_to} to my validateInput(page_from_to, num_slides)
   end repeat
end tell

set content_to_write to pptGetAllText(page_from, page_to)
set filename to POSIX path of (((path to desktop folder) as text) & ppt_name & ".txt")
(*
tell application "TextEdit"
   activate
   set newDoc to (make new document with properties {text:(content_to_write as Unicode text)})
end tell
*)
do shell script "cat > " & filename & " << 'EOF'
" & content_to_write as Unicode text & "
'EOF'"

set file_path to filename as POSIX file
tell application "Finder" to open file file_path

拓展

此处是博主自己拓展的内容

清除txt中多余内容

处理完之后应该会有很多TXT文档，其中有很多换行，“Slide 1”“标题”、“副标题”、“其他占位符”之类的表述，如果不想要，只想要一个干净的纯PPT文字内容的txt，可以用下面这个python代码，它会自动遍历当前目录下的所有txt文件然后删掉上面那些多余的内容

注意：需要先将TXT文件另存为，编码格式选“UTF-8”，否则会出现乱码情况

代码是GPT-4o写的，带有注释，不需要详细解释了吧：

import os
import re

def read_file_with_encodings(file_path, encodings):
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            pass
    raise UnicodeDecodeError(f"Cannot decode file {file_path} with given encodings.")

def convert_to_utf8(file_path):
    encodings = ['cp1252', 'utf-8', 'gbk', 'gb2312']  # 常见的编码，可以根据需要调整
    content = read_file_with_encodings(file_path, encodings)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def process_file_content(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # 删除 '副标题：\t' 、 '标题：\t'、 '正文：\t'和 '其他占位符：\t', '
    content = content.replace('副标题：\t', '')
    content = content.replace('标题：\t', '')
    content = content.replace('其他占位符：\t', '')
    content = content.replace('正文：\t', '')

    # 删除匹配 'Slide:\s+\d+' 的内容
    content = re.sub(r'Slide:\s+\d+', '', content)

    # 删除多于一个的换行符
    while '\n\n' in content:
        content = content.replace('\n\n', '\n')

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def main():
    current_directory = os.getcwd()
    txt_files = [f for f in os.listdir(current_directory) if f.endswith('.txt')]

    for txt_file in txt_files:
        file_path = os.path.join(current_directory, txt_file)
        convert_to_utf8(file_path)
        process_file_content(file_path)

if __name__ == "__main__":
    main()

快速排版

Word中的查找功能可以使用通配符，一般文档的标题结构都是有特征的，直接用正则去查找然后排版就行，效率比较高。

比如，查找“第X章”这种，可以用第[一二三四五六七八九十]章代码；查找“一、”这种可以用[一二三四五六七八九十]、代码，记得查找的时候把“使用通配符”勾上

参考

[^](#ref_1_0)Stack Overflow: Extracting all text from a powerpoint file in VBA https://stackoverflow.com/a/4675964
[^](#ref_20)Export Text to a text file, extract text from PowerPoint (Mac or PC) http://www.pptfaq.com/FAQ00274_Export_Text_to_a_text_file-_extract_text_from_PowerPoint-Mac_or_PC-.htm
后记：将PPT或者DOCX文件的拓展名改成ZIP，就能从中批量提取出视频、图片等媒体文件。